From 32d5a9d5a73e17afb5deef4cc74dbe3d93f42e20 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Thu, 18 Jun 2026 07:03:49 +0000 Subject: [PATCH 01/16] Support raw image refs for multimodal rendering --- renderers/base.py | 21 +-- renderers/client.py | 168 +++++++++++--------- renderers/configs.py | 87 ++++++++-- renderers/mm_store.py | 222 ++++++++++++++++++++++++++ renderers/qwen35.py | 78 ++------- renderers/qwen3_vl.py | 360 +++++++++++++++++++++++++++++++++--------- tests/test_client.py | 229 ++++++++++++++++++++++----- 7 files changed, 892 insertions(+), 273 deletions(-) create mode 100644 renderers/mm_store.py diff --git a/renderers/base.py b/renderers/base.py index 8f722d7..4dbb4f4 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -204,11 +204,10 @@ class MultiModalData: """Multimodal sidecar produced alongside the token stream. Renderer output is framework-agnostic: ``mm_items[modality][i]`` is a - plain ``dict`` mirroring the per-item output of a HuggingFace processor - (e.g. ``{"pixel_values": Tensor, "image_grid_thw": Tensor}`` for - Qwen3-VL images). Translation to engine-specific wire formats — vLLM's - ``MultiModalKwargsItem``, SGLang's payload, etc. — happens in the - inference glue layer (see ``renderers.client``). + plain descriptor dict (e.g. ``{"image_grid_thw": [[1, h, w]]}`` for + Qwen-VL images). Translation to engine-specific wire formats — vLLM image + refs, SGLang payloads, etc. — happens in the inference glue layer (see + ``renderers.client``). """ mm_hashes: dict[str, list[str]] = field(default_factory=dict) @@ -761,8 +760,8 @@ def bridge_to_next_turn( Text-only renderers return :class:`RenderedTokens` with ``multi_modal_data=None``. Multimodal renderers (see :class:`MultimodalRenderer`) populate ``multi_modal_data`` so - the caller can recover placeholder offsets + per-item processed - tensors for the new full prompt; they also accept a + the caller can recover placeholder offsets + per-item image + descriptors for the new full prompt; they also accept a ``previous_multi_modal_data`` kwarg via the :class:`MultimodalRenderer` Protocol override. @@ -818,8 +817,8 @@ def bridge_to_next_turn( the combined token sequence and silently falls back to hash-cache lookup (or errors) - returns :class:`RenderedTokens` (not ``list[int]``) so the - caller can recover the placeholder offsets + per-item - processed tensors for the new full prompt + caller can recover the placeholder offsets + per-item image + descriptors for the new full prompt """ ... @@ -967,6 +966,10 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No with self.checkout() as r: return r.bridge_to_next_turn(*args, **kwargs) + def materialize_image_refs(self, *args: Any, **kwargs: Any) -> "MultiModalData": + with self.checkout() as r: + return r.materialize_image_refs(*args, **kwargs) + # ``mm_token_type_id_map`` (the MultimodalRenderer protocol attribute) # is set in ``__init__`` only for pools wrapping multimodal renderers; # see the comment there for why this isn't a class-level property. diff --git a/renderers/client.py b/renderers/client.py index 0c63c0e..de9df0b 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -15,6 +15,7 @@ import json import logging from collections.abc import Mapping +from dataclasses import replace from typing import Any, cast import httpx @@ -156,6 +157,7 @@ async def generate( priority: int | None = None, extra_headers: dict[str, str] | None = None, max_prompt_len: int | None = None, + materialize_all_image_refs: bool = False, ) -> dict[str, Any]: """Tokenize messages, call vLLM /inference/v1/generate, parse the response. @@ -173,8 +175,9 @@ async def generate( For multimodal renderers (e.g. ``Qwen3VLRenderer``), the call goes through ``renderer.render(...)`` to recover the ``multi_modal_data`` sidecar, then serializes it to vLLM's ``features`` schema (mm_hashes, - mm_placeholders, kwargs_data) before POSTing. The serializer imports - ``vllm.*`` lazily so text-only consumers never pay for the import. + mm_placeholders, kwargs_data) before POSTing. Qwen-family image + ``kwargs_data`` slots are either ``None`` (cache lookup for a prior + image) or run image refs (new/current images that vLLM should process). ``max_prompt_len`` controls the pre-flight overflow check. When the rendered prompt is strictly longer than the cap, the request is never @@ -248,11 +251,23 @@ def _prepare(): "token_ids": prompt_ids, "sampling_params": sp, } - features = ( - _build_mm_features(renderer, mm_data) - if mm_data and not mm_data.is_empty() - else None - ) + + def _features_and_descriptor_mm() -> tuple[dict[str, Any] | None, MultiModalData | None]: + if mm_data is None or mm_data.is_empty(): + return None, mm_data + build_mm = mm_data + if materialize_all_image_refs: + materialize = getattr(renderer, "materialize_image_refs", None) + if materialize is None: + raise NotImplementedError( + f"{type(renderer).__name__} cannot materialize image refs for retry." + ) + build_mm = materialize(mm_data, messages) + return _build_vllm_mm_features(renderer, build_mm), _descriptor_only_mm_data(mm_data) + + features, out_mm_data = await _maybe_offload(renderer, _features_and_descriptor_mm) + if prompt_attr is not None and getattr(prompt_attr, "multi_modal_data", None) is not None: + prompt_attr = replace(prompt_attr, multi_modal_data=out_mm_data) if features is not None: body["features"] = features if cache_salt is not None: @@ -322,7 +337,7 @@ def _prepare(): # The mm sidecar consumed on the request side, surfaced back so # callers can persist it on the trajectory step for downstream # multi-turn bridging and training-sample construction. - "multi_modal_data": mm_data, + "multi_modal_data": out_mm_data, # The renderer's per-token attribution for the prompt — either # the RenderedTokens computed here via renderer.render(...) or # the one threaded in by the caller alongside prompt_ids (the @@ -334,7 +349,31 @@ def _prepare(): } -def _build_mm_features( +def _descriptor_only_mm_data(mm_data: MultiModalData) -> MultiModalData: + """Drop one-request image-ref fields before callers persist mm_data.""" + from renderers.mm_store import IMAGE_REF_PAYLOAD_KEY + + new_items: dict[str, list[dict[str, Any]]] = {} + for modality, items in mm_data.mm_items.items(): + new_items[modality] = [ + { + key: value + for key, value in item.items() + if key + not in { + "pixel_values", + "raw_uri", + "raw_image_id", + "image_layout_fingerprint", + IMAGE_REF_PAYLOAD_KEY, + } + } + for item in items + ] + return replace(mm_data, mm_items=new_items) + + +def _build_vllm_mm_features( renderer: Renderer | RendererPool, mm_data: MultiModalData, ) -> dict[str, Any] | None: @@ -342,22 +381,9 @@ def _build_mm_features( vLLM's ``MultiModalFeatures`` carries three things: hashes (for cache lookup), placeholder positions (so the engine knows where in the - token stream each item lives), and per-item ``MultiModalKwargsItem`` - base64-encoded. The encoding requires vLLM-side type info — what - fields belong to each modality, how they batch — and is currently - model-family specific. For now we dispatch on the renderer class; - extend the dispatch table as more multimodal renderers land. - - NOTE — future engine pluggability: this encoder is vLLM 0.20-specific - (uses ``vllm.multimodal.inputs.MultiModalKwargsItems``, - ``vllm.entrypoints.serve.disagg.mm_serde.encode_mm_kwargs_item``, and - ``_create_qwen2vl_field_factory``). When a second inference engine - arrives (SGLang, MAX, ...) the renderer client should be parameterized - on engine: either (a) move the encoder onto the renderer as - ``encode_mm_for_(mm_data)`` methods, or (b) accept an - ``Encoder`` strategy at the ``generate(...)`` call site. The data type - (``MultiModalData``) is already framework-agnostic and does not need - to change. Don't pre-build the abstraction with one engine in tree. + token stream each item lives), and per-item payload selectors. For + Qwen images, payload selectors are ``None`` for cache-only prior images + or run image refs for images vLLM should process. """ from renderers.qwen3_vl import Qwen3VLRenderer from renderers.qwen35 import Qwen35Renderer @@ -369,43 +395,27 @@ def _build_mm_features( renderer.renderer_cls if isinstance(renderer, RendererPool) else type(renderer) ) - # Qwen3-VL and Qwen3.5 both ship ``pixel_values`` + ``image_grid_thw`` - # via the shared Qwen2-VL field factory. ``spatial_merge_size=2`` is - # the family default and matches every Qwen-VL processor in tree. if issubclass(renderer_cls, (Qwen3VLRenderer, Qwen35Renderer)): - return _build_qwen_vl_features(mm_data, spatial_merge_size=2) + return _build_qwen_vl_image_ref_features(mm_data) raise NotImplementedError( f"Multimodal serialization not implemented for {renderer_cls.__name__}. " - "Add a dispatch branch in renderers.client._build_mm_features." + "Add a dispatch branch in renderers.client._build_vllm_mm_features." ) -def _build_qwen_vl_features( - mm_data: MultiModalData, *, spatial_merge_size: int -) -> dict[str, Any]: - """vLLM features payload for the Qwen-VL family (Qwen2-VL / Qwen3-VL). - - Stacks per-image processor outputs back into a batched ``BatchFeature``, - runs the Qwen2-VL field factory (shared across the family), wraps as - ``MultiModalKwargsItems``, base64-encodes each item, and assembles a - JSON-serializable dict matching vLLM's ``MultiModalFeatures`` schema. +def _build_qwen_vl_image_ref_features(mm_data: MultiModalData) -> dict[str, Any]: + """vLLM features payload for Qwen-VL image refs. Returns ``None`` semantics live one level up — this helper assumes the caller already verified ``mm_data`` is non-empty. """ - try: - import torch - from transformers.feature_extraction_utils import BatchFeature - from vllm.entrypoints.serve.disagg.mm_serde import encode_mm_kwargs_item - from vllm.model_executor.models.qwen2_vl import _create_qwen2vl_field_factory - from vllm.multimodal.inputs import MultiModalKwargsItems - except ImportError as exc: - raise RuntimeError( - "Multimodal generate via /inference/v1/generate requires `vllm` " - "and `torch` to encode the features payload. Install vLLM in this " - "environment, or pre-build features upstream." - ) from exc + from renderers.mm_store import ( + IMAGE_REF_PAYLOAD_KEY, + IMAGE_REF_PAYLOAD_VALUE, + current_run_id, + image_ref, + ) out: dict[str, Any] = { "mm_hashes": {}, @@ -415,32 +425,44 @@ def _build_qwen_vl_features( image_items = mm_data.mm_items.get("image") or [] if image_items: - # mm_items now ship numpy arrays (the renderer is torch-free); - # convert at this vLLM-glue boundary where torch is already a - # hard dependency. - pixel_values = torch.cat( - [torch.as_tensor(it["pixel_values"]) for it in image_items], dim=0 - ) - image_grid_thw = torch.cat( - [torch.as_tensor(it["image_grid_thw"]) for it in image_items], dim=0 - ) - hf_inputs = BatchFeature( - data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw} - ) - config = _create_qwen2vl_field_factory(spatial_merge_size)(hf_inputs) - kwargs_items = MultiModalKwargsItems.from_hf_inputs(hf_inputs, config) - encoded = [encode_mm_kwargs_item(it) for it in kwargs_items["image"]] + mm_hashes = list(mm_data.mm_hashes.get("image") or []) + placeholders = list(mm_data.mm_placeholders.get("image") or []) + if len(mm_hashes) != len(image_items) or len(placeholders) != len(image_items): + raise ValueError( + "Qwen-VL mm sidecar length mismatch: " + f"items={len(image_items)} hashes={len(mm_hashes)} placeholders={len(placeholders)}" + ) + + encoded: list[Any] = [None] * len(image_items) + run_id = current_run_id() + for idx, item in enumerate(image_items): + if item.get(IMAGE_REF_PAYLOAD_KEY) != IMAGE_REF_PAYLOAD_VALUE: + continue + raw_image_id = item.get("raw_image_id") + grid_thw = item.get("image_grid_thw") + fingerprint = item.get("image_layout_fingerprint") + if not isinstance(raw_image_id, str) or not raw_image_id: + raise ValueError("image-ref multimodal item is missing raw_image_id") + if grid_thw is None: + raise ValueError("image-ref multimodal item is missing image_grid_thw") + if not isinstance(fingerprint, str) or not fingerprint: + raise ValueError("image-ref multimodal item is missing image_layout_fingerprint") + encoded[idx] = image_ref( + run_id=run_id, + fingerprint=fingerprint, + modality="image", + mm_hash=mm_hashes[idx], + raw_image_id=raw_image_id, + grid_thw=grid_thw, + ) + out["kwargs_data"]["image"] = encoded - out["mm_hashes"]["image"] = list(mm_data.mm_hashes.get("image") or []) + out["mm_hashes"]["image"] = mm_hashes out["mm_placeholders"]["image"] = [ - {"offset": p.offset, "length": p.length} - for p in mm_data.mm_placeholders.get("image") or [] + {"offset": p.offset, "length": p.length} for p in placeholders ] - # If kwargs_data is empty across all modalities, drop the key so vLLM - # falls back to the hash-only (cache-hit) path. Otherwise hand it the - # full payload. - if not any(out["kwargs_data"].values()): + if not any(item is not None for values in out["kwargs_data"].values() for item in values): out["kwargs_data"] = None return out diff --git a/renderers/configs.py b/renderers/configs.py index d500f8e..b07d97e 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -25,6 +25,12 @@ from pydantic import ConfigDict, Field from pydantic_config import BaseConfig +QWEN_VL_IMAGE_PATCH_SIZE = 16 +QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE = 2 +QWEN_VL_IMAGE_MERGE_SIZE = 2 +QWEN_VL_IMAGE_MIN_PIXELS = 65536 +QWEN_VL_IMAGE_MAX_PIXELS = 16777216 + class BaseRendererConfig(BaseConfig): """Shared fields and config for every renderer config variant. @@ -148,11 +154,30 @@ class Qwen35RendererConfig(BaseRendererConfig): running across the entire conversation. Mirrors the chat template's ``add_vision_id`` toggle.""" - image_cache_max: int = 256 - """FIFO bound on the per-renderer image processor cache. Renderer- - internal — not a Jinja chat-template kwarg.""" + image_patch_size: int = QWEN_VL_IMAGE_PATCH_SIZE + """Qwen image patch size used to compute placeholder layout.""" - _internal_fields = frozenset({"image_cache_max"}) + image_temporal_patch_size: int = QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE + """Qwen temporal patch size used in the image layout fingerprint.""" + + image_merge_size: int = QWEN_VL_IMAGE_MERGE_SIZE + """Qwen spatial merge size used to compute image pad-token counts.""" + + image_min_pixels: int = QWEN_VL_IMAGE_MIN_PIXELS + """Minimum resized image area used by Qwen smart-resize layout math.""" + + image_max_pixels: int = QWEN_VL_IMAGE_MAX_PIXELS + """Maximum resized image area used by Qwen smart-resize layout math.""" + + _internal_fields = frozenset( + { + "image_patch_size", + "image_temporal_patch_size", + "image_merge_size", + "image_min_pixels", + "image_max_pixels", + } + ) class Qwen36RendererConfig(BaseRendererConfig): @@ -166,10 +191,30 @@ class Qwen36RendererConfig(BaseRendererConfig): add_vision_id: bool = False """See :class:`Qwen35RendererConfig.add_vision_id`.""" - image_cache_max: int = 256 - """See :class:`Qwen35RendererConfig.image_cache_max`.""" + image_patch_size: int = QWEN_VL_IMAGE_PATCH_SIZE + """See :class:`Qwen35RendererConfig.image_patch_size`.""" - _internal_fields = frozenset({"image_cache_max"}) + image_temporal_patch_size: int = QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE + """See :class:`Qwen35RendererConfig.image_temporal_patch_size`.""" + + image_merge_size: int = QWEN_VL_IMAGE_MERGE_SIZE + """See :class:`Qwen35RendererConfig.image_merge_size`.""" + + image_min_pixels: int = QWEN_VL_IMAGE_MIN_PIXELS + """See :class:`Qwen35RendererConfig.image_min_pixels`.""" + + image_max_pixels: int = QWEN_VL_IMAGE_MAX_PIXELS + """See :class:`Qwen35RendererConfig.image_max_pixels`.""" + + _internal_fields = frozenset( + { + "image_patch_size", + "image_temporal_patch_size", + "image_merge_size", + "image_min_pixels", + "image_max_pixels", + } + ) class Qwen3VLRendererConfig(BaseRendererConfig): @@ -180,10 +225,30 @@ class Qwen3VLRendererConfig(BaseRendererConfig): add_vision_id: bool = False """See :class:`Qwen35RendererConfig.add_vision_id`.""" - image_cache_max: int = 256 - """See :class:`Qwen35RendererConfig.image_cache_max`.""" + image_patch_size: int = QWEN_VL_IMAGE_PATCH_SIZE + """See :class:`Qwen35RendererConfig.image_patch_size`.""" - _internal_fields = frozenset({"image_cache_max"}) + image_temporal_patch_size: int = QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE + """See :class:`Qwen35RendererConfig.image_temporal_patch_size`.""" + + image_merge_size: int = QWEN_VL_IMAGE_MERGE_SIZE + """See :class:`Qwen35RendererConfig.image_merge_size`.""" + + image_min_pixels: int = QWEN_VL_IMAGE_MIN_PIXELS + """See :class:`Qwen35RendererConfig.image_min_pixels`.""" + + image_max_pixels: int = QWEN_VL_IMAGE_MAX_PIXELS + """See :class:`Qwen35RendererConfig.image_max_pixels`.""" + + _internal_fields = frozenset( + { + "image_patch_size", + "image_temporal_patch_size", + "image_merge_size", + "image_min_pixels", + "image_max_pixels", + } + ) class GLM5RendererConfig(BaseRendererConfig): @@ -295,7 +360,7 @@ class KimiK25RendererConfig(BaseRendererConfig): template's native variable name.""" image_cache_max: int = 256 - """See :class:`Qwen35RendererConfig.image_cache_max`.""" + """FIFO bound on Kimi's per-renderer image processor cache.""" _internal_fields = frozenset({"image_cache_max"}) diff --git a/renderers/mm_store.py b/renderers/mm_store.py new file mode 100644 index 0000000..8cabd14 --- /dev/null +++ b/renderers/mm_store.py @@ -0,0 +1,222 @@ +"""Run-scoped image asset helpers for multimodal rendering. + +The renderer stack does not ship processed multimodal features. Images are +written once into the run output tree and messages carry ``file://`` URLs to +those files. Renderers then emit lightweight image refs for vLLM only when the +engine needs to process an image. +""" + +from __future__ import annotations + +import base64 +import hashlib +import os +import re +import threading +from pathlib import Path + +RUN_OUTPUT_ROOT = Path("/data/outputs") + +IMAGE_OFFLOAD_DIR_ENV = "VF_RENDERER_IMAGE_OFFLOAD_DIR" +RUN_DIR_ENV = "PRIME_RL_RUN_DIR" +RUN_ID_ENV = "RUN_ID" + +IMAGE_ASSET_SUBDIR = Path("assets/images") +IMAGE_REF_PREFIX = "mmraw:v1" +IMAGE_REF_PAYLOAD_KEY = "_prime_rl_image_ref" +IMAGE_REF_PAYLOAD_VALUE = "raw_image" + +_SAFE_RUN_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") +_SAFE_FINGERPRINT_RE = re.compile(r"^[a-f0-9]{16,64}$") +_SAFE_MM_HASH_RE = re.compile(r"^[a-f0-9]{16,128}$") +_SAFE_IMAGE_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") +_SAFE_GRID_THW_RE = re.compile(r"^[0-9]+x[0-9]+x[0-9]+$") + +_MEDIA_TYPE_EXT = {"jpeg": ".jpg", "jpg": ".jpg", "png": ".png", "webp": ".webp", "gif": ".gif"} + + +def normalize_run_id(run_id: str) -> str: + """Return the canonical run id, without the directory's ``run_`` prefix.""" + value = run_id.strip() + if value.startswith("run_"): + value = value[len("run_") :] + if not value or not _SAFE_RUN_ID_RE.fullmatch(value): + raise ValueError(f"Invalid run id: {run_id!r}") + return value + + +def run_dir_name(run_id: str) -> str: + return f"run_{normalize_run_id(run_id)}" + + +def current_run_id() -> str: + """Best-effort run id for refs emitted by this process.""" + raw = os.getenv(RUN_ID_ENV, "").strip() + if raw: + return normalize_run_id(raw) + + run_dir = os.getenv(RUN_DIR_ENV, "").strip() + if run_dir: + return normalize_run_id(Path(run_dir).name) + + image_dir = os.getenv(IMAGE_OFFLOAD_DIR_ENV, "").strip() + if image_dir: + # Expected shape is /assets/images. If callers pass another + # explicit directory, the ref's run segment is only a stable label; the + # path resolver will use the explicit directory in every process. + path = Path(image_dir).resolve() + if path.name == "images" and path.parent.name == "assets": + try: + return normalize_run_id(path.parent.parent.name) + except ValueError: + pass + return "explicit" + + raise RuntimeError( + f"Set {IMAGE_OFFLOAD_DIR_ENV}, {RUN_DIR_ENV}, or {RUN_ID_ENV} before emitting image refs." + ) + + +def run_dir(run_id: str | None = None) -> Path: + """Resolve the run output directory. + + Resolution order: + 1. ``PRIME_RL_RUN_DIR`` as an exact run directory. + 2. ``RUN_ID`` or explicit ``run_id`` under ``/data/outputs/run_``. + """ + explicit = os.getenv(RUN_DIR_ENV, "").strip() + if explicit: + return Path(explicit).resolve() + + value = run_id or os.getenv(RUN_ID_ENV, "").strip() + if not value: + raise RuntimeError(f"Set {RUN_DIR_ENV} or {RUN_ID_ENV} before resolving a run directory.") + return (RUN_OUTPUT_ROOT / run_dir_name(value)).resolve() + + +def run_image_dir(run_id: str | None = None) -> Path: + """Resolve the directory for raw image assets for a run.""" + explicit = os.getenv(IMAGE_OFFLOAD_DIR_ENV, "").strip() + if explicit: + return Path(explicit).resolve() + return (run_dir(run_id) / IMAGE_ASSET_SUBDIR).resolve() + + +def image_asset_dir(run_id: str | None = None) -> Path: + """Alias for callers that already use the assets terminology.""" + return run_image_dir(run_id) + + +def _media_type_ext(media_type: str) -> str: + subtype = media_type.split("/", 1)[-1].split(";", 1)[0].strip().lower() + return _MEDIA_TYPE_EXT.get(subtype, ".img") + + +def offload_image_to_run_assets(url: object, image_dir: Path | None = None) -> tuple[str, int] | None: + """Decode a base64 data image into the run image assets directory. + + Returns ``(file_url, byte_count)`` when ``url`` was rewritten and ``None`` + for non-data-image values. Writes are content-addressed and atomic. + """ + if not isinstance(url, str) or not url.startswith("data:image/"): + return None + marker = ";base64," + if marker not in url: + return None + + header, b64 = url.split(marker, 1) + try: + raw = base64.b64decode(b64) + except Exception: + return None + + root = (image_dir or run_image_dir()).resolve() + root.mkdir(parents=True, exist_ok=True) + digest = hashlib.sha256(raw).hexdigest()[:16] + path = root / f"{digest}{_media_type_ext(header[len('data:') :])}" + if not path.exists(): + tmp = path.with_name(f".{path.name}.{os.getpid()}.{threading.get_ident()}.tmp") + tmp.write_bytes(raw) + os.replace(tmp, path) + else: + try: + path.touch() + except OSError: + pass + return path.as_uri(), len(raw) + + +def raw_image_path(*, run_id: str, raw_image_id: str) -> Path: + if not _SAFE_IMAGE_ID_RE.fullmatch(raw_image_id): + raise ValueError(f"Invalid raw image id: {raw_image_id!r}") + root = run_image_dir(run_id) + path = (root / raw_image_id).resolve() + if not path.is_relative_to(root): + raise ValueError(f"Raw image path escaped root: {path}") + return path + + +def image_layout_fingerprint( + *, + family: str, + patch_size: int, + merge_size: int, + temporal_patch_size: int, + min_pixels: int, + max_pixels: int, +) -> str: + raw = ( + f"image-layout:v1:{family}:{int(patch_size)}:{int(merge_size)}:" + f"{int(temporal_patch_size)}:{int(min_pixels)}:{int(max_pixels)}" + ).encode("utf-8") + return hashlib.sha256(raw).hexdigest()[:32] + + +def _grid_to_ref(grid_thw: object) -> str: + data = grid_thw.tolist() if hasattr(grid_thw, "tolist") else grid_thw + if isinstance(data, list) and data and isinstance(data[0], list): + data = data[0] + if not isinstance(data, (list, tuple)) or len(data) != 3: + raise ValueError(f"Invalid image grid_thw for image ref: {grid_thw!r}") + return "x".join(str(int(v)) for v in data) + + +def _grid_from_ref(value: str) -> list[int]: + if not _SAFE_GRID_THW_RE.fullmatch(value): + raise ValueError(f"Invalid image grid_thw ref segment: {value!r}") + return [int(v) for v in value.split("x")] + + +def image_ref( + *, + run_id: str, + fingerprint: str, + modality: str, + mm_hash: str, + raw_image_id: str, + grid_thw: object, +) -> str: + run_id = normalize_run_id(run_id) + if not _SAFE_FINGERPRINT_RE.fullmatch(fingerprint): + raise ValueError(f"Invalid image layout fingerprint: {fingerprint!r}") + if modality != "image": + raise ValueError(f"Unsupported image ref modality: {modality!r}") + if not _SAFE_MM_HASH_RE.fullmatch(mm_hash): + raise ValueError(f"Invalid image hash: {mm_hash!r}") + raw_image_path(run_id=run_id, raw_image_id=raw_image_id) + return f"{IMAGE_REF_PREFIX}:{run_id}:{fingerprint}:{modality}:{mm_hash}:{raw_image_id}:{_grid_to_ref(grid_thw)}" + + +def split_image_ref(ref: str) -> tuple[str, str, str, str, str, list[int]]: + parts = ref.split(":") + if parts[:2] != ["mmraw", "v1"] or len(parts) != 8: + raise ValueError(f"Invalid image ref shape: {ref!r}") + return normalize_run_id(parts[2]), parts[3], parts[4], parts[5], parts[6], _grid_from_ref(parts[7]) + + +# Backwards-compatible names for consumers that already speak the mmraw wire format. +MMRAW_PREFIX = IMAGE_REF_PREFIX +MM_RAW_PAYLOAD_KEY = IMAGE_REF_PAYLOAD_KEY +MM_RAW_PAYLOAD_VALUE = IMAGE_REF_PAYLOAD_VALUE +mmraw_ref = image_ref +split_mmraw_ref = split_image_ref diff --git a/renderers/qwen35.py b/renderers/qwen35.py index cdb8ee1..c0b76d6 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -7,9 +7,9 @@ processor class ``Qwen3VLProcessor``). When a user/tool message carries an ``ImagePart``, the renderer emits the same ``<|vision_start|>``+N×``<|image_pad|>`` +``<|vision_end|>`` expansion as the HF chat template (``N = -image_grid_thw.prod() // merge_size**2``) and ships processed pixel_values via -``RenderedTokens.multi_modal_data``. Text-only inputs take the original fast -path and remain byte-identical to ``apply_chat_template``. +image_grid_thw.prod() // merge_size**2``) using renderer-declared image layout +metadata. It does not call the HF image processor; vLLM receives run image refs +for images it must process. """ from __future__ import annotations @@ -35,10 +35,10 @@ from renderers.configs import Qwen35RendererConfig from renderers.parsing import parse_qwen35 from renderers.qwen3_vl import ( - _image_hash, _is_image_part, _is_video_part, - _load_pil_image, + materialize_image_refs, + qwen_image_item_for_render, ) # --------------------------------------------------------------------------- @@ -120,7 +120,7 @@ def __init__( processor: Any = None, ): self._tokenizer = tokenizer - self._processor = processor + _ = processor cfg = config or type(self)._config_cls() # ``enable_thinking=None`` defers to the model's known default (see # ``_ENABLE_THINKING_DEFAULTS``). Materialise here so downstream reads @@ -147,11 +147,6 @@ def __init__( self._image_pad = self._token_id("<|image_pad|>") self._video_pad = self._token_id("<|video_pad|>") - # Per-instance image-processor cache; see Qwen3VLRenderer for the - # rationale (FIFO-bounded; same image seen across rollouts / - # bridge re-renders). - self._image_cache: dict[str, tuple[Any, int]] = {} - @property def mm_token_type_id_map(self) -> dict[int, int]: """Token-id → modality marker (1 = image, 2 = video) used by the @@ -160,45 +155,10 @@ def mm_token_type_id_map(self) -> dict[int, int]: """ return {self._image_pad: 1, self._video_pad: 2} - def _get_processor(self): - if self._processor is not None: - return self._processor - from transformers import AutoProcessor - - name = getattr(self._tokenizer, "name_or_path", None) - if not name: - raise RuntimeError( - "Qwen35Renderer needs a processor to render image / video parts. " - "Pass `processor=AutoProcessor.from_pretrained(...)` to the " - "constructor, or load the tokenizer with a known name_or_path " - "so the processor can be auto-loaded." - ) - self._processor = AutoProcessor.from_pretrained(name) - return self._processor - - def _process_image(self, part: dict[str, Any]): - """Resolve, process, and characterize a single image part. - - Returns ``(pil, processor_out, num_image_tokens, image_hash)``. - Mirrors ``Qwen3VLRenderer._process_image``: hashes the loaded PIL, - consults ``self._image_cache``, runs the HF image processor on - miss, FIFO-evicts on overflow. - """ - pil = _load_pil_image(part) - h = _image_hash(pil) - cached = self._image_cache.get(h) - if cached is not None: - out, num_image_tokens = cached - return pil, out, num_image_tokens, h - proc = self._get_processor() - out = proc.image_processor(images=[pil], return_tensors="np") - grid_thw = out["image_grid_thw"][0] - merge_size = proc.image_processor.merge_size - num_image_tokens = int(grid_thw.prod()) // (merge_size * merge_size) - if len(self._image_cache) >= self.config.image_cache_max: - self._image_cache.pop(next(iter(self._image_cache))) - self._image_cache[h] = (out, num_image_tokens) - return pil, out, num_image_tokens, h + def materialize_image_refs( + self, mm_data: MultiModalData, messages: list[Message] + ) -> MultiModalData: + return materialize_image_refs(self, mm_data, messages) @staticmethod def _content_has_media(content: Any) -> bool: @@ -364,7 +324,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None: # image data, so they ARE body content (is_content=True); # the surrounding ``<|vision_start|>`` / ``<|vision_end|>`` # specials are template scaffold. - _, out, n, h = self._process_image(part) + n, h, mm_item = qwen_image_item_for_render(self, part) vision_counts["image"] += 1 if self.config.add_vision_id: emit_text( @@ -386,12 +346,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None: mm_placeholders.setdefault("image", []).append( PlaceholderRange(offset=offset, length=n) ) - mm_items.setdefault("image", []).append( - { - "pixel_values": out["pixel_values"], - "image_grid_thw": out["image_grid_thw"], - } - ) + mm_items.setdefault("image", []).append(mm_item) def emit_user_with_media(content_list: list[Any], msg_idx: int) -> None: """Emit a user message whose content list contains image parts. @@ -715,7 +670,7 @@ def emit_text_segments( content_mask.append(is_content) def emit_image(part: dict[str, Any], msg_idx: int = -1) -> None: - _, out, n, h = self._process_image(part) + n, h, mm_item = qwen_image_item_for_render(self, part) vision_counts["image"] += 1 if self.config.add_vision_id: emit_text(f"Picture {vision_counts['image']}: ", msg_idx) @@ -728,12 +683,7 @@ def emit_image(part: dict[str, Any], msg_idx: int = -1) -> None: new_placeholders.setdefault("image", []).append( PlaceholderRange(offset=offset, length=n) ) - new_items.setdefault("image", []).append( - { - "pixel_values": out["pixel_values"], - "image_grid_thw": out["image_grid_thw"], - } - ) + new_items.setdefault("image", []).append(mm_item) def emit_user_with_media(content_list: list[Any], msg_idx: int) -> None: emit_special(self._im_start, msg_idx) diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 7b82d7e..9b865d0 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -6,14 +6,11 @@ for image inputs as the HF processor (``N = image_grid_thw.prod() // merge_size**2``). -Image data is shipped to the inference engine via -``RenderedTokens.multi_modal_data``: ``mm_placeholders`` records the -``(offset, length)`` span of each image's placeholder tokens in the -prompt, ``mm_items`` carries the per-image processor output -(``pixel_values``, ``image_grid_thw``), and ``mm_hashes`` carries a -stable identifier for cache lookup. The wire-format conversion to -vLLM's ``/inference/v1/generate`` ``features`` field lives in -``renderers.client``. +Image data is shipped to the inference engine via run image refs, not +processed image-processor payloads. ``RenderedTokens.multi_modal_data`` +records placeholder spans, stable image hashes, and Qwen layout metadata +(``image_grid_thw``) so vLLM can cache-match prior images and process new +image refs itself. BPE boundary discipline: text runs that the chat template emits contiguously (e.g. ``"user\\n" + content_text``) must be encoded as a @@ -30,8 +27,11 @@ import hashlib import io import json +import math +from dataclasses import dataclass, replace +from pathlib import Path from typing import Any -from urllib.parse import urlparse +from urllib.parse import unquote, urlparse from transformers.tokenization_utils import PreTrainedTokenizer @@ -48,6 +48,11 @@ trim_to_turn_close, ) from renderers.configs import Qwen3VLRendererConfig +from renderers.mm_store import ( + IMAGE_REF_PAYLOAD_KEY, + IMAGE_REF_PAYLOAD_VALUE, + image_layout_fingerprint, +) from renderers.parsing import parse_qwen3 _TOOLS_HEADER = ( @@ -163,6 +168,261 @@ def _image_hash(pil_image) -> str: return h.hexdigest()[:32] +@dataclass(frozen=True) +class QwenImageLayoutConfig: + patch_size: int + temporal_patch_size: int + merge_size: int + min_pixels: int + max_pixels: int + + +@dataclass(frozen=True) +class QwenImageLayoutDescriptor: + mm_hash: str + image_grid_thw: list[list[int]] + num_image_tokens: int + fingerprint: str + raw_uri: str | None = None + raw_image_id: str | None = None + + +def qwen_image_layout_config_for_renderer(renderer: Any) -> QwenImageLayoutConfig: + config = renderer.config + values = { + "patch_size": getattr(config, "image_patch_size", None), + "temporal_patch_size": getattr(config, "image_temporal_patch_size", None), + "merge_size": getattr(config, "image_merge_size", None), + "min_pixels": getattr(config, "image_min_pixels", None), + "max_pixels": getattr(config, "image_max_pixels", None), + } + missing = [name for name, value in values.items() if value is None] + if missing: + raise RuntimeError( + "Qwen image layout must be declared on the renderer config; missing " + + ", ".join(missing) + ) + return QwenImageLayoutConfig( + patch_size=int(values["patch_size"]), + temporal_patch_size=int(values["temporal_patch_size"]), + merge_size=int(values["merge_size"]), + min_pixels=int(values["min_pixels"]), + max_pixels=int(values["max_pixels"]), + ) + + +def _smart_resize( + height: int, + width: int, + *, + factor: int, + min_pixels: int, + max_pixels: int, +) -> tuple[int, int]: + """Qwen image resize math without materializing resized pixels.""" + if height <= 0 or width <= 0: + raise ValueError(f"image dimensions must be positive, got {height}x{width}") + if max(height, width) / min(height, width) > 200: + raise ValueError( + "absolute aspect ratio must be smaller than 200, got " + f"{max(height, width) / min(height, width)}" + ) + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = max(factor, math.floor(height / beta / factor) * factor) + w_bar = max(factor, math.floor(width / beta / factor) * factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + return h_bar, w_bar + + +def _image_source(item: dict[str, Any]) -> Any: + if "image" in item: + return item["image"] + if "image_url" in item: + image_url = item.get("image_url") + return image_url.get("url") if isinstance(image_url, dict) else image_url + return item.get("url") or item.get("path") + + +def _file_path_from_source(source: Any) -> Path | None: + if not isinstance(source, str): + return None + parsed = urlparse(source) + if parsed.scheme == "file": + return Path(unquote(parsed.path)).resolve() + if parsed.scheme == "": + return Path(source).resolve() + return None + + +def _image_dimensions(source: Any) -> tuple[int, int]: + try: + from PIL import Image + except ImportError as exc: + raise RuntimeError( + "Pillow is required to read image dimensions for multimodal rendering." + ) from exc + + path = _file_path_from_source(source) + if path is not None: + with Image.open(path) as image: + return image.height, image.width + + image = _load_pil_image({"image": source}) + return image.height, image.width + + +def _image_content_hash(source: Any) -> str: + path = _file_path_from_source(source) + if path is not None: + return hashlib.sha256(path.read_bytes()).hexdigest()[:32] + return _image_hash(_load_pil_image({"image": source})) + + +def _raw_uri_and_id(source: Any) -> tuple[str | None, str | None]: + path = _file_path_from_source(source) + if path is None: + return None, None + return path.as_uri(), path.name + + +def describe_qwen_image_layout(renderer: Any, part: dict[str, Any]) -> QwenImageLayoutDescriptor: + """Return Qwen image layout metadata without invoking an image processor.""" + source = _image_source(part) + height, width = _image_dimensions(source) + layout = qwen_image_layout_config_for_renderer(renderer) + resized_h, resized_w = _smart_resize( + height, + width, + factor=layout.patch_size * layout.merge_size, + min_pixels=layout.min_pixels, + max_pixels=layout.max_pixels, + ) + grid_t = 1 + grid_h = resized_h // layout.patch_size + grid_w = resized_w // layout.patch_size + num_image_tokens = grid_t * grid_h * grid_w // (layout.merge_size * layout.merge_size) + fingerprint = image_layout_fingerprint( + family="qwen_vl", + patch_size=layout.patch_size, + merge_size=layout.merge_size, + temporal_patch_size=layout.temporal_patch_size, + min_pixels=layout.min_pixels, + max_pixels=layout.max_pixels, + ) + raw_uri, raw_image_id = _raw_uri_and_id(source) + return QwenImageLayoutDescriptor( + mm_hash=_image_content_hash(source), + image_grid_thw=[[grid_t, grid_h, grid_w]], + num_image_tokens=num_image_tokens, + fingerprint=fingerprint, + raw_uri=raw_uri, + raw_image_id=raw_image_id, + ) + + +def qwen_image_item_for_render(renderer: Any, part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: + desc = describe_qwen_image_layout(renderer, part) + item: dict[str, Any] = {"image_grid_thw": desc.image_grid_thw} + if desc.raw_uri is not None and desc.raw_image_id is not None: + item.update( + { + "raw_uri": desc.raw_uri, + "raw_image_id": desc.raw_image_id, + "image_layout_fingerprint": desc.fingerprint, + IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, + } + ) + return desc.num_image_tokens, desc.mm_hash, item + + +def _iter_image_parts(messages: list[Any]): + for msg in messages or []: + content = msg.get("content") if isinstance(msg, dict) else None + if not isinstance(content, list): + continue + for item in content: + if isinstance(item, dict) and _is_image_part(item): + yield item + + +def _grids_equal(a: Any, b: Any) -> bool: + if a is None or b is None: + return False + al = a.tolist() if hasattr(a, "tolist") else list(a) + bl = b.tolist() if hasattr(b, "tolist") else list(b) + return al == bl + + +def materialize_image_refs(renderer: Any, mm_data: MultiModalData, messages: list[Message]) -> MultiModalData: + """Attach run-image refs to every Qwen image descriptor that can be found.""" + image_items = mm_data.mm_items.get("image") or [] + if not image_items: + return mm_data + hashes = mm_data.mm_hashes.get("image") or [] + if len(hashes) != len(image_items): + raise ValueError( + "materialize_image_refs: mm_hashes/mm_items length mismatch " + f"({len(hashes)} vs {len(image_items)})" + ) + + missing = set(hashes) + resolved: dict[str, QwenImageLayoutDescriptor] = {} + for part in _iter_image_parts(messages): + if not missing: + break + desc = describe_qwen_image_layout(renderer, part) + if desc.mm_hash in missing: + resolved[desc.mm_hash] = desc + missing.discard(desc.mm_hash) + if missing: + raise ValueError( + f"materialize_image_refs: {len(missing)} image hash(es) not found in messages" + ) + + new_image_items: list[dict[str, Any]] = [] + for i, item in enumerate(image_items): + desc = resolved[hashes[i]] + if desc.raw_uri is None or desc.raw_image_id is None: + raise ValueError("materialize_image_refs requires file-backed image URLs") + item_grid = item.get("image_grid_thw") + if item_grid is not None and not _grids_equal(desc.image_grid_thw, item_grid): + raise ValueError( + "materialize_image_refs: reconstructed image_grid_thw " + f"{desc.image_grid_thw!r} != descriptor {item_grid!r}" + ) + new_item = { + k: v + for k, v in item.items() + if k + not in { + "raw_uri", + "raw_image_id", + "image_layout_fingerprint", + IMAGE_REF_PAYLOAD_KEY, + } + } + new_item.update( + { + "image_grid_thw": item_grid if item_grid is not None else desc.image_grid_thw, + "raw_uri": desc.raw_uri, + "raw_image_id": desc.raw_image_id, + "image_layout_fingerprint": desc.fingerprint, + IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, + } + ) + new_image_items.append(new_item) + + new_items = dict(mm_data.mm_items) + new_items["image"] = new_image_items + return replace(mm_data, mm_items=new_items) + + class _Emitter: """Token-stream builder with BPE-safe text buffering. @@ -296,11 +556,9 @@ class Qwen3VLRenderer: config: Typed renderer config (see :class:`renderers.Qwen3VLRendererConfig`). Defaults to a blank config with template defaults. - processor: Optional ``Qwen3VLProcessor``. Required when rendering - messages that contain image / video parts. If not supplied, - the renderer lazy-loads it via ``AutoProcessor.from_pretrained`` - keyed off ``tokenizer.name_or_path`` the first time a - multimodal part is seen. + processor: Deprecated and ignored. Image layout is declared by the + renderer config; the renderer never loads or calls an HF image + processor. ``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls`` on the config are no-ops here — the chat template drops past @@ -315,7 +573,7 @@ def __init__( processor: Any = None, ): self._tokenizer = tokenizer - self._processor = processor + _ = processor self.config = config or Qwen3VLRendererConfig() self._im_start = self._token_id("<|im_start|>") @@ -331,16 +589,6 @@ def __init__( self._image_pad = self._token_id("<|image_pad|>") self._video_pad = self._token_id("<|video_pad|>") - # Per-instance image-processor cache. The HF image processor is the - # most expensive step on the renderer hot path (~tens of ms per - # image for typical grid_thw). The same image gets re-seen across - # ``rollouts_per_example`` rollouts of one example and (for - # multi-turn) across turn boundaries when the bridge re-renders - # rather than extends. Cache keyed by content hash — values are - # tuples of ``(processor_out, num_image_tokens)`` — bounded to - # avoid unbounded growth on long-lived pools. - self._image_cache: dict[str, tuple[Any, int]] = {} - def _token_id(self, token: str) -> int: tid = self._tokenizer.convert_tokens_to_ids(token) assert isinstance(tid, int) and tid != self._tokenizer.unk_token_id, ( @@ -366,22 +614,6 @@ def _encode(self, text: str) -> list[int]: return [] return self._tokenizer.encode(text, add_special_tokens=False) - def _get_processor(self): - if self._processor is not None: - return self._processor - from transformers import AutoProcessor - - name = getattr(self._tokenizer, "name_or_path", None) - if not name: - raise RuntimeError( - "Qwen3VLRenderer needs a processor to render image / video parts. " - "Pass `processor=AutoProcessor.from_pretrained(...)` to the " - "constructor, or load the tokenizer with a known name_or_path " - "so the processor can be auto-loaded." - ) - self._processor = AutoProcessor.from_pretrained(name) - return self._processor - @staticmethod def _render_text_content(content: Any) -> str: """Flatten a content list to a single text string, dropping media parts. @@ -410,30 +642,10 @@ def _render_text_content(content: Any) -> str: return "".join(parts) raise TypeError(f"Unexpected content type: {type(content)}") - def _process_image(self, part: dict[str, Any]): - """Resolve, process, and characterize a single image part. - - Returns ``(pil, processor_out, num_image_tokens, image_hash)``. - Hashes the loaded PIL first and consults ``self._image_cache``; - on hit the HF image-processor call is skipped entirely. - """ - pil = _load_pil_image(part) - h = _image_hash(pil) - cached = self._image_cache.get(h) - if cached is not None: - out, num_image_tokens = cached - return pil, out, num_image_tokens, h - proc = self._get_processor() - out = proc.image_processor(images=[pil], return_tensors="np") - grid_thw = out["image_grid_thw"][0] - merge_size = proc.image_processor.merge_size - num_image_tokens = int(grid_thw.prod()) // (merge_size * merge_size) - if len(self._image_cache) >= self.config.image_cache_max: - # FIFO eviction — Python dicts preserve insertion order, so - # ``next(iter(...))`` is the oldest key. - self._image_cache.pop(next(iter(self._image_cache))) - self._image_cache[h] = (out, num_image_tokens) - return pil, out, num_image_tokens, h + def materialize_image_refs( + self, mm_data: MultiModalData, messages: list[Message] + ) -> MultiModalData: + return materialize_image_refs(self, mm_data, messages) def render( self, @@ -464,7 +676,7 @@ def emit_image(part: dict[str, Any]) -> None: # image data, so they ARE body content (is_content=True); # the surrounding ``<|vision_start|>`` / ``<|vision_end|>`` # markers are renderer-emitted scaffold. - _, out, n, h = self._process_image(part) + n, h, mm_item = qwen_image_item_for_render(self, part) vision_counts["image"] += 1 if self.config.add_vision_id: em.text( @@ -481,12 +693,7 @@ def emit_image(part: dict[str, Any]) -> None: mm_placeholders.setdefault("image", []).append( PlaceholderRange(offset=offset, length=n) ) - mm_items.setdefault("image", []).append( - { - "pixel_values": out["pixel_values"], - "image_grid_thw": out["image_grid_thw"], - } - ) + mm_items.setdefault("image", []).append(mm_item) def render_media_content(content: Any) -> None: """Emit a user/tool content list with media handled inline. @@ -730,7 +937,7 @@ def bridge_to_next_turn( vision_counts = {"image": prev_image_count, "video": prev_video_count} def emit_image(part: dict[str, Any]) -> None: - _, out, n, h = self._process_image(part) + n, h, mm_item = qwen_image_item_for_render(self, part) vision_counts["image"] += 1 if self.config.add_vision_id: em.text( @@ -747,12 +954,7 @@ def emit_image(part: dict[str, Any]) -> None: new_placeholders.setdefault("image", []).append( PlaceholderRange(offset=offset, length=n) ) - new_items.setdefault("image", []).append( - { - "pixel_values": out["pixel_values"], - "image_grid_thw": out["image_grid_thw"], - } - ) + new_items.setdefault("image", []).append(mm_item) def render_media_content(content: Any) -> None: if isinstance(content, str): diff --git a/tests/test_client.py b/tests/test_client.py index 1cc1000..c1c7aaf 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,5 +1,6 @@ import asyncio import base64 +import hashlib import json import httpx @@ -101,6 +102,141 @@ async def post(self, path, *, cast_to=dict, body=None, options=None): ) +def test_run_image_dir_resolution_prefers_explicit_image_dir(tmp_path, monkeypatch): + from renderers.mm_store import run_image_dir + + image_dir = tmp_path / "custom-images" + monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) + monkeypatch.setenv("PRIME_RL_RUN_DIR", str(tmp_path / "run_other")) + monkeypatch.setenv("RUN_ID", "other") + + assert run_image_dir() == image_dir.resolve() + + +def test_run_image_dir_resolution_owns_run_prefix(monkeypatch): + from renderers.mm_store import run_image_dir + + monkeypatch.delenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", raising=False) + monkeypatch.delenv("PRIME_RL_RUN_DIR", raising=False) + monkeypatch.setenv("RUN_ID", "run_abc") + + assert run_image_dir().as_posix() == "/data/outputs/run_abc/assets/images" + + +class _TinyQwenTokenizer: + unk_token_id = -1 + _specials = { + "<|im_start|>": 1, + "<|im_end|>": 2, + "<|endoftext|>": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "": 8, + "<|vision_start|>": 9, + "<|vision_end|>": 10, + "<|image_pad|>": 11, + "<|video_pad|>": 12, + } + + def convert_tokens_to_ids(self, token): + return self._specials.get(token, self.unk_token_id) + + def encode(self, text, add_special_tokens=False): + return [100 + ord(ch) % 50 for ch in text] + + +def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): + pytest.importorskip("PIL") + from PIL import Image + from renderers.mm_store import IMAGE_REF_PAYLOAD_KEY, IMAGE_REF_PAYLOAD_VALUE + from renderers.qwen3_vl import Qwen3VLRenderer + + image_path = tmp_path / "image.png" + Image.new("RGB", (32, 32), color=(255, 0, 0)).save(image_path) + renderer = Qwen3VLRenderer(_TinyQwenTokenizer()) + + rendered = renderer.render( + [ + { + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": image_path.as_uri()}}], + } + ], + add_generation_prompt=True, + ) + + item = rendered.multi_modal_data.mm_items["image"][0] + assert "pixel_values" not in item + assert item["image_grid_thw"] == [[1, 16, 16]] + assert item["raw_image_id"] == "image.png" + assert item[IMAGE_REF_PAYLOAD_KEY] == IMAGE_REF_PAYLOAD_VALUE + assert rendered.multi_modal_data.mm_placeholders["image"][0].length == 64 + + +def test_generate_materialize_all_image_refs_rehydrates_descriptor_slots(tmp_path, monkeypatch): + pytest.importorskip("PIL") + from PIL import Image + + from renderers.base import MultiModalData, ParsedResponse, PlaceholderRange + from renderers.mm_store import split_image_ref + from renderers.qwen3_vl import Qwen3VLRenderer + + class _RetryRenderer(Qwen3VLRenderer): + supports_tools = True + + def get_stop_token_ids(self): + return [99] + + def parse_response(self, completion_ids, *, tools=None): + return ParsedResponse(content="done") + + image_dir = tmp_path / "run_retry" / "assets" / "images" + image_dir.mkdir(parents=True) + image_path = image_dir / "image.png" + Image.new("RGB", (32, 32), color=(0, 255, 0)).save(image_path) + monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) + monkeypatch.setenv("RUN_ID", "retry") + + mm_hash = hashlib.sha256(image_path.read_bytes()).hexdigest()[:32] + mm_data = MultiModalData( + mm_hashes={"image": [mm_hash]}, + mm_placeholders={"image": [PlaceholderRange(offset=5, length=64)]}, + mm_items={"image": [{"image_grid_thw": [[1, 16, 16]]}]}, + ) + renderer = _RetryRenderer(_TinyQwenTokenizer()) + client = _FakeClient() + + asyncio.run( + generate( + client=client, + renderer=renderer, + messages=[ + { + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": image_path.as_uri()}}], + } + ], + model="qwen3-vl", + prompt_ids=list(range(20)), + multi_modal_data=mm_data, + sampling_params={"max_tokens": 4}, + materialize_all_image_refs=True, + ) + ) + + ref = client.calls[0]["body"]["features"]["kwargs_data"]["image"][0] + run_id, _fingerprint, modality, parsed_hash, raw_image_id, grid = split_image_ref(ref) + assert (run_id, modality, parsed_hash, raw_image_id, grid) == ( + "retry", + "image", + mm_hash, + "image.png", + [1, 16, 16], + ) + + def test_generate_builds_request_body_and_parses_response(): client = _FakeClient() renderer = _FakeRenderer() @@ -281,47 +417,63 @@ def test_generate_threads_prompt_attribution_through_prebuilt_prompt_path(): @pytest.mark.parametrize( - "model_id,renderer_class_path", + "renderer_class_path", [ - ("Qwen/Qwen3-VL-4B-Instruct", "renderers.qwen3_vl:Qwen3VLRenderer"), - ("Qwen/Qwen3.5-2B", "renderers.qwen35:Qwen35Renderer"), + "renderers.qwen3_vl:Qwen3VLRenderer", + "renderers.qwen35:Qwen35Renderer", ], ids=["qwen3_vl", "qwen35"], ) -def test_generate_serializes_multimodal_features_for_qwen_vl_family( - model_id, renderer_class_path +def test_generate_serializes_image_refs_for_qwen_vl_family( + tmp_path, monkeypatch, renderer_class_path ): """When the renderer emits ``MultiModalData``, ``generate`` translates it into vLLM's ``features`` payload (mm_hashes + mm_placeholders + - base64-encoded kwargs_data) and sticks it in the request body. Covers - every renderer routed through ``_build_qwen_vl_features``.""" + image-ref kwargs_data) and sticks it in the request body. Descriptor-only + images stay ``None`` so vLLM can resolve them from its cache.""" import importlib - pytest.importorskip("torch") - pytest.importorskip("vllm", reason="vllm needed for features serialization") - - import torch as _torch from renderers.base import ( MultiModalData, + ParsedResponse, PlaceholderRange, - load_tokenizer, + ) + from renderers.mm_store import ( + IMAGE_REF_PAYLOAD_KEY, + IMAGE_REF_PAYLOAD_VALUE, + image_layout_fingerprint, + split_image_ref, ) mod_name, cls_name = renderer_class_path.split(":") renderer_cls = getattr(importlib.import_module(mod_name), cls_name) - # Build a minimal real renderer so type dispatch in - # _build_mm_features hits the qwen branch. The tokenizer is only - # touched in __init__ to grab special-token ids; render() / etc. - # aren't called here because we pre-supply prompt_ids + mm_data. - tokenizer = load_tokenizer(model_id) - renderer = renderer_cls(tokenizer) + class _BareRenderer(renderer_cls): + supports_tools = True + + def get_stop_token_ids(self): + return [99] + + def parse_response(self, completion_ids, *, tools=None): + return ParsedResponse(content="done") + + renderer = _BareRenderer.__new__(_BareRenderer) + image_dir = tmp_path / "run_rawtest" / "assets" / "images" + image_dir.mkdir(parents=True) + (image_dir / "image.png").write_bytes(b"image-bytes") + monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) + monkeypatch.setenv("RUN_ID", "rawtest") + fingerprint = image_layout_fingerprint( + family="qwen_vl", + patch_size=16, + merge_size=2, + temporal_patch_size=2, + min_pixels=65536, + max_pixels=16777216, + ) - # Two synthetic 1×2×2 images. Field factory expects pixel_values - # shape ``(sum_HW, embed_dim)`` and grid_thw shape ``(N, 3)``; the - # values themselves don't matter for the encoding round-trip. mm_data = MultiModalData( - mm_hashes={"image": ["aaa", "bbb"]}, + mm_hashes={"image": ["a" * 32, "b" * 32]}, mm_placeholders={ "image": [ PlaceholderRange(offset=5, length=1), @@ -331,19 +483,18 @@ def test_generate_serializes_multimodal_features_for_qwen_vl_family( mm_items={ "image": [ { - "pixel_values": _torch.zeros(4, 8, dtype=_torch.float32), - "image_grid_thw": _torch.tensor([[1, 2, 2]], dtype=_torch.int64), - }, - { - "pixel_values": _torch.zeros(4, 8, dtype=_torch.float32), - "image_grid_thw": _torch.tensor([[1, 2, 2]], dtype=_torch.int64), + "image_grid_thw": [[1, 2, 2]], + "raw_image_id": "image.png", + "image_layout_fingerprint": fingerprint, + IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, }, + {"image_grid_thw": [[1, 2, 2]]}, ], }, ) client = _FakeClient() - asyncio.run( + result = asyncio.run( generate( client=client, renderer=renderer, @@ -358,17 +509,21 @@ def test_generate_serializes_multimodal_features_for_qwen_vl_family( body = client.calls[0]["body"] assert "features" in body, "multimodal call should attach features" features = body["features"] - assert features["mm_hashes"] == {"image": ["aaa", "bbb"]} + assert features["mm_hashes"] == {"image": ["a" * 32, "b" * 32]} assert features["mm_placeholders"] == { "image": [{"offset": 5, "length": 1}, {"offset": 10, "length": 1}], } - assert "kwargs_data" in features - assert features["kwargs_data"] is not None - assert "image" in features["kwargs_data"] - assert len(features["kwargs_data"]["image"]) == 2 - # Items are base64 strings (encode_mm_kwargs_item output). - for item in features["kwargs_data"]["image"]: - assert isinstance(item, str) and len(item) > 0 + items = features["kwargs_data"]["image"] + assert items[1] is None + assert split_image_ref(items[0]) == ( + "rawtest", + fingerprint, + "image", + "a" * 32, + "image.png", + [1, 2, 2], + ) + assert "raw_image_id" not in result["multi_modal_data"].mm_items["image"][0] # --------------------------------------------------------------------------- From 4bc1766c024da0acdb8f0c2481631fe8b184d43c Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Sat, 20 Jun 2026 07:41:16 +0000 Subject: [PATCH 02/16] Emit generic raw multimodal refs --- renderers/base.py | 6 +- renderers/client.py | 105 ++++++++---------- renderers/configs.py | 42 ++++++- renderers/kimi_k25.py | 249 +++++++++++++++++++++++++++++++++++++++--- renderers/mm_store.py | 151 ++++++++++++++++++------- renderers/qwen3_vl.py | 94 +++++++++++----- tests/test_client.py | 45 +++++--- 7 files changed, 522 insertions(+), 170 deletions(-) diff --git a/renderers/base.py b/renderers/base.py index 4dbb4f4..b6cc1ca 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -204,9 +204,9 @@ class MultiModalData: """Multimodal sidecar produced alongside the token stream. Renderer output is framework-agnostic: ``mm_items[modality][i]`` is a - plain descriptor dict (e.g. ``{"image_grid_thw": [[1, h, w]]}`` for - Qwen-VL images). Translation to engine-specific wire formats — vLLM image - refs, SGLang payloads, etc. — happens in the inference glue layer (see + plain raw descriptor envelope with a model-family key and an adapter-owned + payload. Translation to engine-specific wire formats — vLLM image refs, + SGLang payloads, etc. — happens in the inference glue layer (see ``renderers.client``). """ diff --git a/renderers/client.py b/renderers/client.py index de9df0b..df3dffb 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -172,12 +172,12 @@ async def generate( attribution (``is_content`` / ``sampled_mask`` / ``message_indices`` / ``message_roles``) into the result without re-rendering. - For multimodal renderers (e.g. ``Qwen3VLRenderer``), the call goes + For multimodal renderers, the call goes through ``renderer.render(...)`` to recover the ``multi_modal_data`` sidecar, then serializes it to vLLM's ``features`` schema (mm_hashes, - mm_placeholders, kwargs_data) before POSTing. Qwen-family image - ``kwargs_data`` slots are either ``None`` (cache lookup for a prior - image) or run image refs (new/current images that vLLM should process). + mm_placeholders, kwargs_data) before POSTing. Raw image ``kwargs_data`` + slots are either ``None`` (cache lookup for a prior image) or descriptor + refs (new/current images that vLLM should process). ``max_prompt_len`` controls the pre-flight overflow check. When the rendered prompt is strictly longer than the cap, the request is never @@ -364,7 +364,6 @@ def _descriptor_only_mm_data(mm_data: MultiModalData) -> MultiModalData: "pixel_values", "raw_uri", "raw_image_id", - "image_layout_fingerprint", IMAGE_REF_PAYLOAD_KEY, } } @@ -380,41 +379,18 @@ def _build_vllm_mm_features( """Serialize ``MultiModalData`` to vLLM's ``/inference/v1/generate`` features payload. vLLM's ``MultiModalFeatures`` carries three things: hashes (for cache - lookup), placeholder positions (so the engine knows where in the - token stream each item lives), and per-item payload selectors. For - Qwen images, payload selectors are ``None`` for cache-only prior images - or run image refs for images vLLM should process. - """ - from renderers.qwen3_vl import Qwen3VLRenderer - from renderers.qwen35 import Qwen35Renderer - - # Type dispatch only needs the renderer class. Pools expose - # ``renderer_cls`` as a snapshot attribute, so we don't have to check - # out a slot just to read ``type(r)``. - renderer_cls = ( - renderer.renderer_cls if isinstance(renderer, RendererPool) else type(renderer) - ) - - if issubclass(renderer_cls, (Qwen3VLRenderer, Qwen35Renderer)): - return _build_qwen_vl_image_ref_features(mm_data) - - raise NotImplementedError( - f"Multimodal serialization not implemented for {renderer_cls.__name__}. " - "Add a dispatch branch in renderers.client._build_vllm_mm_features." - ) - - -def _build_qwen_vl_image_ref_features(mm_data: MultiModalData) -> dict[str, Any]: - """vLLM features payload for Qwen-VL image refs. - - Returns ``None`` semantics live one level up — this helper assumes - the caller already verified ``mm_data`` is non-empty. + lookup), placeholder positions (so the engine knows where in the token + stream each item lives), and per-item payload selectors. Raw multimodal + descriptors use the common envelope emitted by renderers; family-specific + geometry stays inside the descriptor payload and is interpreted downstream + by prime-rl/vLLM adapters. """ from renderers.mm_store import ( IMAGE_REF_PAYLOAD_KEY, IMAGE_REF_PAYLOAD_VALUE, + RAW_MM_ITEM_KIND, current_run_id, - image_ref, + raw_mm_ref, ) out: dict[str, Any] = { @@ -423,45 +399,54 @@ def _build_qwen_vl_image_ref_features(mm_data: MultiModalData) -> dict[str, Any] "kwargs_data": {}, } - image_items = mm_data.mm_items.get("image") or [] - if image_items: - mm_hashes = list(mm_data.mm_hashes.get("image") or []) - placeholders = list(mm_data.mm_placeholders.get("image") or []) - if len(mm_hashes) != len(image_items) or len(placeholders) != len(image_items): + run_id = current_run_id() + for source_modality, items in mm_data.mm_items.items(): + if not items: + continue + mm_hashes = list(mm_data.mm_hashes.get(source_modality) or []) + placeholders = list(mm_data.mm_placeholders.get(source_modality) or []) + if len(mm_hashes) != len(items) or len(placeholders) != len(items): raise ValueError( - "Qwen-VL mm sidecar length mismatch: " - f"items={len(image_items)} hashes={len(mm_hashes)} placeholders={len(placeholders)}" + "Multimodal sidecar length mismatch: " + f"modality={source_modality} items={len(items)} " + f"hashes={len(mm_hashes)} placeholders={len(placeholders)}" ) - encoded: list[Any] = [None] * len(image_items) - run_id = current_run_id() - for idx, item in enumerate(image_items): + for idx, item in enumerate(items): + if item.get("kind") != RAW_MM_ITEM_KIND: + raise NotImplementedError( + "Multimodal serialization requires raw descriptor envelopes; " + f"got item keys {sorted(item)} for modality {source_modality!r}." + ) + feature_modality = item.get("vllm_modality") or source_modality + if not isinstance(feature_modality, str) or not feature_modality: + raise ValueError("raw multimodal item has invalid vllm_modality") + out["mm_hashes"].setdefault(feature_modality, []).append(mm_hashes[idx]) + out["mm_placeholders"].setdefault(feature_modality, []).append( + {"offset": placeholders[idx].offset, "length": placeholders[idx].length} + ) + out["kwargs_data"].setdefault(feature_modality, []).append(None) if item.get(IMAGE_REF_PAYLOAD_KEY) != IMAGE_REF_PAYLOAD_VALUE: continue raw_image_id = item.get("raw_image_id") - grid_thw = item.get("image_grid_thw") - fingerprint = item.get("image_layout_fingerprint") + family = item.get("family") + fingerprint = item.get("layout_fingerprint") if not isinstance(raw_image_id, str) or not raw_image_id: - raise ValueError("image-ref multimodal item is missing raw_image_id") - if grid_thw is None: - raise ValueError("image-ref multimodal item is missing image_grid_thw") + raise ValueError("raw multimodal item is missing raw_image_id") + if not isinstance(family, str) or not family: + raise ValueError("raw multimodal item is missing family") if not isinstance(fingerprint, str) or not fingerprint: - raise ValueError("image-ref multimodal item is missing image_layout_fingerprint") - encoded[idx] = image_ref( + raise ValueError("raw multimodal item is missing layout_fingerprint") + out["kwargs_data"][feature_modality][-1] = raw_mm_ref( run_id=run_id, + family=family, fingerprint=fingerprint, - modality="image", + modality=feature_modality, mm_hash=mm_hashes[idx], raw_image_id=raw_image_id, - grid_thw=grid_thw, + payload=item.get("payload") or {}, ) - out["kwargs_data"]["image"] = encoded - out["mm_hashes"]["image"] = mm_hashes - out["mm_placeholders"]["image"] = [ - {"offset": p.offset, "length": p.length} for p in placeholders - ] - if not any(item is not None for values in out["kwargs_data"].values() for item in values): out["kwargs_data"] = None diff --git a/renderers/configs.py b/renderers/configs.py index b07d97e..54ac342 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -31,6 +31,14 @@ QWEN_VL_IMAGE_MIN_PIXELS = 65536 QWEN_VL_IMAGE_MAX_PIXELS = 16777216 +KIMI_K25_IMAGE_PATCH_SIZE = 14 +KIMI_K25_IMAGE_MERGE_KERNEL_SIZE = 2 +KIMI_K25_IMAGE_IN_PATCH_LIMIT = 16384 +KIMI_K25_IMAGE_PATCH_LIMIT_ON_ONE_SIDE = 512 +KIMI_K25_IMAGE_FIXED_OUTPUT_TOKENS: int | None = None +KIMI_K25_IMAGE_MEAN = (0.5, 0.5, 0.5) +KIMI_K25_IMAGE_STD = (0.5, 0.5, 0.5) + class BaseRendererConfig(BaseConfig): """Shared fields and config for every renderer config variant. @@ -362,7 +370,39 @@ class KimiK25RendererConfig(BaseRendererConfig): image_cache_max: int = 256 """FIFO bound on Kimi's per-renderer image processor cache.""" - _internal_fields = frozenset({"image_cache_max"}) + image_patch_size: int = KIMI_K25_IMAGE_PATCH_SIZE + """Kimi MoonViT patch size used to compute raw image layout descriptors.""" + + image_merge_kernel_size: int = KIMI_K25_IMAGE_MERGE_KERNEL_SIZE + """Kimi spatial merge kernel used to compute output media-token layout.""" + + image_in_patch_limit: int = KIMI_K25_IMAGE_IN_PATCH_LIMIT + """Kimi NavIT input patch budget used by image resize layout math.""" + + image_patch_limit_on_one_side: int = KIMI_K25_IMAGE_PATCH_LIMIT_ON_ONE_SIDE + """Kimi per-side patch cap used by image resize layout math.""" + + image_fixed_output_tokens: int | None = KIMI_K25_IMAGE_FIXED_OUTPUT_TOKENS + """Optional fixed Kimi output token count. Current K2.5/K2.6 configs use ``None``.""" + + image_mean: tuple[float, float, float] = KIMI_K25_IMAGE_MEAN + """Kimi image normalization mean, included in processor fingerprints.""" + + image_std: tuple[float, float, float] = KIMI_K25_IMAGE_STD + """Kimi image normalization std, included in processor fingerprints.""" + + _internal_fields = frozenset( + { + "image_cache_max", + "image_patch_size", + "image_merge_kernel_size", + "image_in_patch_limit", + "image_patch_limit_on_one_side", + "image_fixed_output_tokens", + "image_mean", + "image_std", + } + ) class LagunaXS2RendererConfig(BaseRendererConfig): diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index bca4464..a9bbf4a 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -22,7 +22,9 @@ from __future__ import annotations import json +import math import re +from dataclasses import dataclass from typing import Any from transformers.tokenization_utils import PreTrainedTokenizer @@ -44,11 +46,16 @@ from renderers.configs import KimiK25RendererConfig from renderers.parsing import _reasoning_end_token_index, parse_kimi_k2_section from renderers.qwen3_vl import ( + _image_content_hash, + _image_dimensions, _image_hash, + _image_source, _is_image_part, _is_video_part, _load_pil_image, + _raw_uri_and_id, ) +from renderers.mm_store import image_layout_fingerprint, raw_mm_item # --------------------------------------------------------------------------- # Constants @@ -56,6 +63,9 @@ _DEFAULT_SYSTEM_PROMPT = "You are Kimi, an AI assistant created by Moonshot AI." +KIMI_K25_FAMILY = "kimi_k25" +KIMI_K25_VLLM_MODALITY = "vision_chunk" + # --------------------------------------------------------------------------- # TypeScript-style tool declaration # --------------------------------------------------------------------------- @@ -401,6 +411,218 @@ def _encode_tools_typescript(tools: list[ToolSpec]) -> str: return "# Tools\n\n## functions\nnamespace functions {\n" + functions_str + "\n}\n" +@dataclass(frozen=True) +class KimiImageLayoutConfig: + patch_size: int + merge_kernel_size: int + in_patch_limit: int + patch_limit_on_one_side: int + fixed_output_tokens: int | None + image_mean: tuple[float, ...] + image_std: tuple[float, ...] + + +@dataclass(frozen=True) +class KimiImageLayoutDescriptor: + mm_hash: str + grid_thws: list[list[int]] + num_media_tokens: int + fingerprint: str + raw_uri: str | None = None + raw_image_id: str | None = None + + +def kimi_image_layout_config_for_renderer(renderer: Any) -> KimiImageLayoutConfig: + config = renderer.config + values = { + "patch_size": getattr(config, "image_patch_size", None), + "merge_kernel_size": getattr(config, "image_merge_kernel_size", None), + "in_patch_limit": getattr(config, "image_in_patch_limit", None), + "patch_limit_on_one_side": getattr(config, "image_patch_limit_on_one_side", None), + "fixed_output_tokens": getattr(config, "image_fixed_output_tokens", None), + "image_mean": getattr(config, "image_mean", None), + "image_std": getattr(config, "image_std", None), + } + missing = [ + name + for name, value in values.items() + if value is None and name != "fixed_output_tokens" + ] + if missing: + raise RuntimeError( + "Kimi image layout must be declared on the renderer config; missing " + + ", ".join(missing) + ) + return KimiImageLayoutConfig( + patch_size=int(values["patch_size"]), + merge_kernel_size=int(values["merge_kernel_size"]), + in_patch_limit=int(values["in_patch_limit"]), + patch_limit_on_one_side=int(values["patch_limit_on_one_side"]), + fixed_output_tokens=( + None if values["fixed_output_tokens"] is None else int(values["fixed_output_tokens"]) + ), + image_mean=tuple(float(v) for v in values["image_mean"]), + image_std=tuple(float(v) for v in values["image_std"]), + ) + + +def _ceil_to_factor(value: int, factor: int) -> int: + return max(factor, math.ceil(value / factor) * factor) + + +def _kimi_resize_config(width: int, height: int, layout: KimiImageLayoutConfig) -> tuple[int, int, int]: + """Kimi MoonViT/NavIT image resize layout without materializing pixels.""" + if height <= 0 or width <= 0: + raise ValueError(f"image dimensions must be positive, got {height}x{width}") + patch_size = layout.patch_size + patch_limit_pixels = layout.patch_limit_on_one_side * patch_size + s1 = math.sqrt( + layout.in_patch_limit + / ( + max(1.0, width // patch_size) + * max(1.0, height // patch_size) + ) + ) + s2 = patch_limit_pixels / width + s3 = patch_limit_pixels / height + scale = min(1.0, s1, s2, s3) + resized_w = min(max(1, int(width * scale)), patch_limit_pixels) + resized_h = min(max(1, int(height * scale)), patch_limit_pixels) + + factor = layout.merge_kernel_size * patch_size + padded_w = _ceil_to_factor(resized_w, factor) + padded_h = _ceil_to_factor(resized_h, factor) + if layout.fixed_output_tokens is not None: + num_tokens = layout.fixed_output_tokens + else: + num_tokens = (padded_h // factor) * (padded_w // factor) + return padded_w, padded_h, int(num_tokens) + + +def describe_kimi_image_layout(renderer: Any, part: dict[str, Any]) -> KimiImageLayoutDescriptor: + source = _image_source(part) + height, width = _image_dimensions(source) + layout = kimi_image_layout_config_for_renderer(renderer) + padded_w, padded_h, num_media_tokens = _kimi_resize_config(width, height, layout) + grid_thws = [[1, padded_h // layout.patch_size, padded_w // layout.patch_size]] + fingerprint = image_layout_fingerprint( + family=KIMI_K25_FAMILY, + patch_size=layout.patch_size, + merge_kernel_size=layout.merge_kernel_size, + in_patch_limit=layout.in_patch_limit, + patch_limit_on_one_side=layout.patch_limit_on_one_side, + fixed_output_tokens=layout.fixed_output_tokens, + image_mean=list(layout.image_mean), + image_std=list(layout.image_std), + ) + raw_uri, raw_image_id = _raw_uri_and_id(source) + return KimiImageLayoutDescriptor( + mm_hash=_image_content_hash(source), + grid_thws=grid_thws, + num_media_tokens=num_media_tokens, + fingerprint=fingerprint, + raw_uri=raw_uri, + raw_image_id=raw_image_id, + ) + + +def kimi_image_item_for_render(renderer: Any, part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: + desc = describe_kimi_image_layout(renderer, part) + item = raw_mm_item( + modality="image", + family=KIMI_K25_FAMILY, + layout_fingerprint=desc.fingerprint, + payload={ + "grid_thws": desc.grid_thws, + "num_media_tokens": desc.num_media_tokens, + }, + raw_uri=desc.raw_uri, + raw_image_id=desc.raw_image_id, + vllm_modality=KIMI_K25_VLLM_MODALITY, + ) + return 1, desc.mm_hash, item + + +def _kimi_grid_from_item(item: dict[str, Any]) -> Any: + payload = item.get("payload") + if isinstance(payload, dict) and payload.get("grid_thws") is not None: + return payload["grid_thws"] + return item.get("grid_thws") + + +def _kimi_grids_equal(a: Any, b: Any) -> bool: + if a is None or b is None: + return False + al = a.tolist() if hasattr(a, "tolist") else a + bl = b.tolist() if hasattr(b, "tolist") else b + return al == bl + + +def materialize_kimi_image_refs(renderer: Any, mm_data: MultiModalData, messages: list[Message]) -> MultiModalData: + """Attach run-image refs to every Kimi image descriptor that can be found.""" + from dataclasses import replace + + image_items = mm_data.mm_items.get("image") or [] + if not image_items: + return mm_data + hashes = mm_data.mm_hashes.get("image") or [] + if len(hashes) != len(image_items): + raise ValueError( + "materialize_kimi_image_refs: mm_hashes/mm_items length mismatch " + f"({len(hashes)} vs {len(image_items)})" + ) + + missing = set(hashes) + resolved: dict[str, KimiImageLayoutDescriptor] = {} + for msg in messages or []: + content = msg.get("content") if isinstance(msg, dict) else None + if not isinstance(content, list): + continue + for part in content: + if not missing: + break + if not (isinstance(part, dict) and _is_image_part(part)): + continue + desc = describe_kimi_image_layout(renderer, part) + if desc.mm_hash in missing: + resolved[desc.mm_hash] = desc + missing.discard(desc.mm_hash) + if missing: + raise ValueError( + f"materialize_kimi_image_refs: {len(missing)} image hash(es) not found in messages" + ) + + new_image_items: list[dict[str, Any]] = [] + for i, item in enumerate(image_items): + desc = resolved[hashes[i]] + if desc.raw_uri is None or desc.raw_image_id is None: + raise ValueError("materialize_kimi_image_refs requires file-backed image URLs") + item_grid = _kimi_grid_from_item(item) + if item_grid is not None and not _kimi_grids_equal(desc.grid_thws, item_grid): + raise ValueError( + "materialize_kimi_image_refs: reconstructed grid_thws " + f"{desc.grid_thws!r} != descriptor {item_grid!r}" + ) + new_image_items.append( + raw_mm_item( + modality="image", + family=KIMI_K25_FAMILY, + layout_fingerprint=desc.fingerprint, + payload={ + "grid_thws": item_grid if item_grid is not None else desc.grid_thws, + "num_media_tokens": desc.num_media_tokens, + }, + raw_uri=desc.raw_uri, + raw_image_id=desc.raw_image_id, + vllm_modality=KIMI_K25_VLLM_MODALITY, + ) + ) + + new_items = dict(mm_data.mm_items) + new_items["image"] = new_image_items + return replace(mm_data, mm_items=new_items) + + # --------------------------------------------------------------------------- # Kimi K2.5 response parsing (mirrors K2 format, same token structure) # --------------------------------------------------------------------------- @@ -647,6 +869,11 @@ def mm_token_type_id_map(self) -> dict[int, int]: internally from ``pixel_values``.""" return {self._media_pad: 1} + def materialize_image_refs( + self, mm_data: MultiModalData, messages: list[Message] + ) -> MultiModalData: + return materialize_kimi_image_refs(self, mm_data, messages) + def _get_processor(self): if self._processor is not None: return self._processor @@ -815,7 +1042,7 @@ def emit_image( ``<|media_content|>``, ``<|media_end|>``, the trailing ``\\n``) are template-injected scaffold. """ - _, out, _num_patches, h = self._process_image(part) + _placeholder_len, h, mm_item = kimi_image_item_for_render(self, part) emit_special( self._media_begin, msg_idx, is_sampled=is_sampled, is_content=False ) @@ -838,16 +1065,7 @@ def emit_image( mm_placeholders.setdefault("image", []).append( PlaceholderRange(offset=offset, length=1) ) - # ``grid_thws`` (Kimi) is the per-image equivalent of Qwen-VL's - # ``image_grid_thw``. Ship under Kimi's native key so the - # orchestrator's generic ``torch.cat``-based packer routes it - # directly into the model's forward kwargs. - mm_items.setdefault("image", []).append( - { - "pixel_values": out["pixel_values"], - "grid_thws": out["grid_thws"], - } - ) + mm_items.setdefault("image", []).append(mm_item) # ── Tool declaration prefix (comes first) ── # K2.5/K2.6's tokenizer auto-computes ``tools_ts_str`` and threads @@ -1110,7 +1328,7 @@ def emit_image( is_sampled: bool = False, is_content: bool = False, ) -> None: - _, out, _num_patches, h = self._process_image(part) + _placeholder_len, h, mm_item = kimi_image_item_for_render(self, part) emit_special(self._media_begin, msg_idx) emit_text("image", msg_idx) emit_special(self._media_content, msg_idx) @@ -1124,12 +1342,7 @@ def emit_image( new_placeholders.setdefault("image", []).append( PlaceholderRange(offset=offset, length=1) ) - new_items.setdefault("image", []).append( - { - "pixel_values": out["pixel_values"], - "grid_thws": out["grid_thws"], - } - ) + new_items.setdefault("image", []).append(mm_item) # Bridge handles user/system/tool only (reject_assistant_in_extension # blocks assistants), so no hist/suffix split needed. diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 8cabd14..a27ea8a 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -10,9 +10,11 @@ import base64 import hashlib +import json import os import re import threading +from dataclasses import dataclass from pathlib import Path RUN_OUTPUT_ROOT = Path("/data/outputs") @@ -22,15 +24,19 @@ RUN_ID_ENV = "RUN_ID" IMAGE_ASSET_SUBDIR = Path("assets/images") -IMAGE_REF_PREFIX = "mmraw:v1" +IMAGE_REF_PREFIX = "mmraw:v2" IMAGE_REF_PAYLOAD_KEY = "_prime_rl_image_ref" IMAGE_REF_PAYLOAD_VALUE = "raw_image" +RAW_MM_ITEM_KIND = "prime_raw_mm_item" +RAW_MM_ITEM_VERSION = 1 _SAFE_RUN_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") +_SAFE_FAMILY_RE = re.compile(r"^[A-Za-z0-9_.-]+$") +_SAFE_MODALITY_RE = re.compile(r"^[A-Za-z0-9_.-]+$") _SAFE_FINGERPRINT_RE = re.compile(r"^[a-f0-9]{16,64}$") _SAFE_MM_HASH_RE = re.compile(r"^[a-f0-9]{16,128}$") _SAFE_IMAGE_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") -_SAFE_GRID_THW_RE = re.compile(r"^[0-9]+x[0-9]+x[0-9]+$") +_SAFE_REF_PAYLOAD_RE = re.compile(r"^[A-Za-z0-9_-]*$") _MEDIA_TYPE_EXT = {"jpeg": ".jpg", "jpg": ".jpg", "png": ".png", "webp": ".webp", "gif": ".gif"} @@ -156,67 +162,132 @@ def raw_image_path(*, run_id: str, raw_image_id: str) -> Path: return path -def image_layout_fingerprint( - *, - family: str, - patch_size: int, - merge_size: int, - temporal_patch_size: int, - min_pixels: int, - max_pixels: int, -) -> str: - raw = ( - f"image-layout:v1:{family}:{int(patch_size)}:{int(merge_size)}:" - f"{int(temporal_patch_size)}:{int(min_pixels)}:{int(max_pixels)}" - ).encode("utf-8") - return hashlib.sha256(raw).hexdigest()[:32] +def _json_fingerprint_value(value: object) -> str: + return json.dumps(value, sort_keys=True, separators=(",", ":"), default=str) -def _grid_to_ref(grid_thw: object) -> str: - data = grid_thw.tolist() if hasattr(grid_thw, "tolist") else grid_thw - if isinstance(data, list) and data and isinstance(data[0], list): - data = data[0] - if not isinstance(data, (list, tuple)) or len(data) != 3: - raise ValueError(f"Invalid image grid_thw for image ref: {grid_thw!r}") - return "x".join(str(int(v)) for v in data) - - -def _grid_from_ref(value: str) -> list[int]: - if not _SAFE_GRID_THW_RE.fullmatch(value): - raise ValueError(f"Invalid image grid_thw ref segment: {value!r}") - return [int(v) for v in value.split("x")] +def image_layout_fingerprint(*, family: str, **values: object) -> str: + """Stable adapter-owned fingerprint for raw multimodal layout contracts.""" + if not _SAFE_FAMILY_RE.fullmatch(family): + raise ValueError(f"Invalid multimodal family: {family!r}") + encoded_values = ":".join(f"{key}={_json_fingerprint_value(values[key])}" for key in sorted(values)) + raw = f"image-layout:v1:{family}:{encoded_values}".encode("utf-8") + return hashlib.sha256(raw).hexdigest()[:32] -def image_ref( +def raw_mm_item( + *, + modality: str, + family: str, + layout_fingerprint: str, + payload: dict[str, object], + raw_uri: str | None = None, + raw_image_id: str | None = None, + vllm_modality: str | None = None, +) -> dict[str, object]: + """Build the JSON-safe raw multimodal descriptor envelope. + + ``payload`` is intentionally adapter-owned. Shared consumers may route by + ``family`` and validate the common envelope, but must not inspect adapter + payload keys. + """ + if not _SAFE_FAMILY_RE.fullmatch(family): + raise ValueError(f"Invalid multimodal family: {family!r}") + if not _SAFE_MODALITY_RE.fullmatch(modality): + raise ValueError(f"Invalid raw multimodal modality: {modality!r}") + if not _SAFE_FINGERPRINT_RE.fullmatch(layout_fingerprint): + raise ValueError(f"Invalid image layout fingerprint: {layout_fingerprint!r}") + out: dict[str, object] = { + "kind": RAW_MM_ITEM_KIND, + "version": RAW_MM_ITEM_VERSION, + "modality": modality, + "family": family, + "layout_fingerprint": layout_fingerprint, + "payload": payload, + } + if vllm_modality is not None: + out["vllm_modality"] = vllm_modality + if raw_uri is not None and raw_image_id is not None: + out.update( + { + "raw_uri": raw_uri, + "raw_image_id": raw_image_id, + IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, + } + ) + return out + + +@dataclass(frozen=True) +class RawMMRef: + run_id: str + family: str + fingerprint: str + modality: str + mm_hash: str + raw_image_id: str + payload: dict[str, object] + + +def raw_mm_ref( *, run_id: str, + family: str, fingerprint: str, modality: str, mm_hash: str, raw_image_id: str, - grid_thw: object, + payload: dict[str, object] | None = None, ) -> str: + """Generic raw multimodal asset ref. + + Adapter-owned details stay in the descriptor payload so refs can serve + future families without baking shape names into the wire id. + """ run_id = normalize_run_id(run_id) + if not _SAFE_FAMILY_RE.fullmatch(family): + raise ValueError(f"Invalid multimodal family: {family!r}") if not _SAFE_FINGERPRINT_RE.fullmatch(fingerprint): raise ValueError(f"Invalid image layout fingerprint: {fingerprint!r}") - if modality != "image": - raise ValueError(f"Unsupported image ref modality: {modality!r}") + if not _SAFE_MODALITY_RE.fullmatch(modality): + raise ValueError(f"Invalid raw multimodal modality: {modality!r}") if not _SAFE_MM_HASH_RE.fullmatch(mm_hash): raise ValueError(f"Invalid image hash: {mm_hash!r}") raw_image_path(run_id=run_id, raw_image_id=raw_image_id) - return f"{IMAGE_REF_PREFIX}:{run_id}:{fingerprint}:{modality}:{mm_hash}:{raw_image_id}:{_grid_to_ref(grid_thw)}" + encoded_payload = base64.urlsafe_b64encode( + json.dumps(payload or {}, sort_keys=True, separators=(",", ":")).encode("utf-8") + ).decode("ascii").rstrip("=") + return ( + f"{IMAGE_REF_PREFIX}:{run_id}:{family}:{fingerprint}:" + f"{modality}:{mm_hash}:{raw_image_id}:{encoded_payload}" + ) -def split_image_ref(ref: str) -> tuple[str, str, str, str, str, list[int]]: +def split_raw_mm_ref(ref: str) -> RawMMRef: parts = ref.split(":") - if parts[:2] != ["mmraw", "v1"] or len(parts) != 8: - raise ValueError(f"Invalid image ref shape: {ref!r}") - return normalize_run_id(parts[2]), parts[3], parts[4], parts[5], parts[6], _grid_from_ref(parts[7]) + if parts[:2] != ["mmraw", "v2"] or len(parts) != 9: + raise ValueError(f"Invalid raw multimodal ref shape: {ref!r}") + run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = parts[2:] + if not _SAFE_REF_PAYLOAD_RE.fullmatch(encoded_payload): + raise ValueError("Invalid raw multimodal ref payload segment") + padded = encoded_payload + "=" * (-len(encoded_payload) % 4) + payload = json.loads(base64.urlsafe_b64decode(padded.encode("ascii")).decode("utf-8")) + if not isinstance(payload, dict): + raise ValueError("Raw multimodal ref payload must decode to a dict") + return RawMMRef( + run_id=normalize_run_id(run_id), + family=family, + fingerprint=fingerprint, + modality=modality, + mm_hash=mm_hash, + raw_image_id=raw_image_id, + payload=payload, + ) # Backwards-compatible names for consumers that already speak the mmraw wire format. MMRAW_PREFIX = IMAGE_REF_PREFIX MM_RAW_PAYLOAD_KEY = IMAGE_REF_PAYLOAD_KEY MM_RAW_PAYLOAD_VALUE = IMAGE_REF_PAYLOAD_VALUE -mmraw_ref = image_ref -split_mmraw_ref = split_image_ref +mmraw_ref = raw_mm_ref +split_mmraw_ref = split_raw_mm_ref diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 9b865d0..1cde900 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -52,6 +52,7 @@ IMAGE_REF_PAYLOAD_KEY, IMAGE_REF_PAYLOAD_VALUE, image_layout_fingerprint, + raw_mm_item, ) from renderers.parsing import parse_qwen3 @@ -328,16 +329,14 @@ def describe_qwen_image_layout(renderer: Any, part: dict[str, Any]) -> QwenImage def qwen_image_item_for_render(renderer: Any, part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: desc = describe_qwen_image_layout(renderer, part) - item: dict[str, Any] = {"image_grid_thw": desc.image_grid_thw} - if desc.raw_uri is not None and desc.raw_image_id is not None: - item.update( - { - "raw_uri": desc.raw_uri, - "raw_image_id": desc.raw_image_id, - "image_layout_fingerprint": desc.fingerprint, - IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, - } - ) + item = raw_mm_item( + modality="image", + family="qwen_vl", + layout_fingerprint=desc.fingerprint, + payload={"image_grid_thw": desc.image_grid_thw}, + raw_uri=desc.raw_uri, + raw_image_id=desc.raw_image_id, + ) return desc.num_image_tokens, desc.mm_hash, item @@ -359,6 +358,54 @@ def _grids_equal(a: Any, b: Any) -> bool: return al == bl +def _qwen_grid_from_item(item: dict[str, Any]) -> Any: + payload = item.get("payload") + if isinstance(payload, dict) and payload.get("image_grid_thw") is not None: + return payload["image_grid_thw"] + return item.get("image_grid_thw") + + +def _qwen_item_with_grid_and_ref( + item: dict[str, Any], + *, + image_grid_thw: Any, + fingerprint: str, + raw_uri: str, + raw_image_id: str, +) -> dict[str, Any]: + new_item = { + k: v + for k, v in item.items() + if k + not in { + "raw_uri", + "raw_image_id", + "image_layout_fingerprint", + IMAGE_REF_PAYLOAD_KEY, + } + } + if new_item.get("family") == "qwen_vl" and isinstance(new_item.get("payload"), dict): + payload = dict(new_item["payload"]) + payload["image_grid_thw"] = image_grid_thw + new_item["payload"] = payload + new_item["layout_fingerprint"] = fingerprint + else: + new_item = raw_mm_item( + modality="image", + family="qwen_vl", + layout_fingerprint=fingerprint, + payload={"image_grid_thw": image_grid_thw}, + ) + new_item.update( + { + "raw_uri": raw_uri, + "raw_image_id": raw_image_id, + IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, + } + ) + return new_item + + def materialize_image_refs(renderer: Any, mm_data: MultiModalData, messages: list[Message]) -> MultiModalData: """Attach run-image refs to every Qwen image descriptor that can be found.""" image_items = mm_data.mm_items.get("image") or [] @@ -390,31 +437,18 @@ def materialize_image_refs(renderer: Any, mm_data: MultiModalData, messages: lis desc = resolved[hashes[i]] if desc.raw_uri is None or desc.raw_image_id is None: raise ValueError("materialize_image_refs requires file-backed image URLs") - item_grid = item.get("image_grid_thw") + item_grid = _qwen_grid_from_item(item) if item_grid is not None and not _grids_equal(desc.image_grid_thw, item_grid): raise ValueError( "materialize_image_refs: reconstructed image_grid_thw " f"{desc.image_grid_thw!r} != descriptor {item_grid!r}" ) - new_item = { - k: v - for k, v in item.items() - if k - not in { - "raw_uri", - "raw_image_id", - "image_layout_fingerprint", - IMAGE_REF_PAYLOAD_KEY, - } - } - new_item.update( - { - "image_grid_thw": item_grid if item_grid is not None else desc.image_grid_thw, - "raw_uri": desc.raw_uri, - "raw_image_id": desc.raw_image_id, - "image_layout_fingerprint": desc.fingerprint, - IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, - } + new_item = _qwen_item_with_grid_and_ref( + item, + image_grid_thw=item_grid if item_grid is not None else desc.image_grid_thw, + fingerprint=desc.fingerprint, + raw_uri=desc.raw_uri, + raw_image_id=desc.raw_image_id, ) new_image_items.append(new_item) diff --git a/tests/test_client.py b/tests/test_client.py index c1c7aaf..ac0ec16 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -169,7 +169,8 @@ def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): item = rendered.multi_modal_data.mm_items["image"][0] assert "pixel_values" not in item - assert item["image_grid_thw"] == [[1, 16, 16]] + assert item["family"] == "qwen_vl" + assert item["payload"]["image_grid_thw"] == [[1, 16, 16]] assert item["raw_image_id"] == "image.png" assert item[IMAGE_REF_PAYLOAD_KEY] == IMAGE_REF_PAYLOAD_VALUE assert rendered.multi_modal_data.mm_placeholders["image"][0].length == 64 @@ -180,7 +181,7 @@ def test_generate_materialize_all_image_refs_rehydrates_descriptor_slots(tmp_pat from PIL import Image from renderers.base import MultiModalData, ParsedResponse, PlaceholderRange - from renderers.mm_store import split_image_ref + from renderers.mm_store import split_raw_mm_ref from renderers.qwen3_vl import Qwen3VLRenderer class _RetryRenderer(Qwen3VLRenderer): @@ -226,14 +227,14 @@ def parse_response(self, completion_ids, *, tools=None): ) ) - ref = client.calls[0]["body"]["features"]["kwargs_data"]["image"][0] - run_id, _fingerprint, modality, parsed_hash, raw_image_id, grid = split_image_ref(ref) - assert (run_id, modality, parsed_hash, raw_image_id, grid) == ( + ref_item = client.calls[0]["body"]["features"]["kwargs_data"]["image"][0] + ref = split_raw_mm_ref(ref_item) + assert ref.payload["image_grid_thw"] == [[1, 16, 16]] + assert (ref.run_id, ref.modality, ref.mm_hash, ref.raw_image_id) == ( "retry", "image", mm_hash, "image.png", - [1, 16, 16], ) @@ -439,10 +440,9 @@ def test_generate_serializes_image_refs_for_qwen_vl_family( PlaceholderRange, ) from renderers.mm_store import ( - IMAGE_REF_PAYLOAD_KEY, - IMAGE_REF_PAYLOAD_VALUE, image_layout_fingerprint, - split_image_ref, + raw_mm_item, + split_raw_mm_ref, ) mod_name, cls_name = renderer_class_path.split(":") @@ -482,13 +482,20 @@ def parse_response(self, completion_ids, *, tools=None): }, mm_items={ "image": [ - { - "image_grid_thw": [[1, 2, 2]], - "raw_image_id": "image.png", - "image_layout_fingerprint": fingerprint, - IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, - }, - {"image_grid_thw": [[1, 2, 2]]}, + raw_mm_item( + modality="image", + family="qwen_vl", + layout_fingerprint=fingerprint, + payload={"image_grid_thw": [[1, 2, 2]]}, + raw_uri=(image_dir / "image.png").as_uri(), + raw_image_id="image.png", + ), + raw_mm_item( + modality="image", + family="qwen_vl", + layout_fingerprint=fingerprint, + payload={"image_grid_thw": [[1, 2, 2]]}, + ), ], }, ) @@ -515,13 +522,15 @@ def parse_response(self, completion_ids, *, tools=None): } items = features["kwargs_data"]["image"] assert items[1] is None - assert split_image_ref(items[0]) == ( + ref = split_raw_mm_ref(items[0]) + assert ref.payload == {"image_grid_thw": [[1, 2, 2]]} + assert (ref.run_id, ref.family, ref.fingerprint, ref.modality, ref.mm_hash, ref.raw_image_id) == ( "rawtest", + "qwen_vl", fingerprint, "image", "a" * 32, "image.png", - [1, 2, 2], ) assert "raw_image_id" not in result["multi_modal_data"].mm_items["image"][0] From a8f43867bd02078f876c9c777bc7a6803814ad71 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Thu, 25 Jun 2026 06:46:26 +0000 Subject: [PATCH 03/16] Fix raw image renderer style checks --- renderers/client.py | 17 +++++++++++++---- renderers/kimi_k25.py | 41 +++++++++++++++++++++++++++-------------- renderers/mm_store.py | 40 +++++++++++++++++++++++++++++++--------- renderers/qwen3_vl.py | 20 +++++++++++++++----- tests/test_client.py | 41 ++++++++++++++++++++++++++++++----------- 5 files changed, 116 insertions(+), 43 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index df3dffb..25ab763 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -252,7 +252,9 @@ def _prepare(): "sampling_params": sp, } - def _features_and_descriptor_mm() -> tuple[dict[str, Any] | None, MultiModalData | None]: + def _features_and_descriptor_mm() -> tuple[ + dict[str, Any] | None, MultiModalData | None + ]: if mm_data is None or mm_data.is_empty(): return None, mm_data build_mm = mm_data @@ -263,10 +265,15 @@ def _features_and_descriptor_mm() -> tuple[dict[str, Any] | None, MultiModalData f"{type(renderer).__name__} cannot materialize image refs for retry." ) build_mm = materialize(mm_data, messages) - return _build_vllm_mm_features(renderer, build_mm), _descriptor_only_mm_data(mm_data) + return _build_vllm_mm_features(renderer, build_mm), _descriptor_only_mm_data( + mm_data + ) features, out_mm_data = await _maybe_offload(renderer, _features_and_descriptor_mm) - if prompt_attr is not None and getattr(prompt_attr, "multi_modal_data", None) is not None: + if ( + prompt_attr is not None + and getattr(prompt_attr, "multi_modal_data", None) is not None + ): prompt_attr = replace(prompt_attr, multi_modal_data=out_mm_data) if features is not None: body["features"] = features @@ -447,7 +454,9 @@ def _build_vllm_mm_features( payload=item.get("payload") or {}, ) - if not any(item is not None for values in out["kwargs_data"].values() for item in values): + if not any( + item is not None for values in out["kwargs_data"].values() for item in values + ): out["kwargs_data"] = None return out diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index a9bbf4a..b2cc234 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -25,7 +25,7 @@ import math import re from dataclasses import dataclass -from typing import Any +from typing import Any, cast from transformers.tokenization_utils import PreTrainedTokenizer @@ -438,7 +438,9 @@ def kimi_image_layout_config_for_renderer(renderer: Any) -> KimiImageLayoutConfi "patch_size": getattr(config, "image_patch_size", None), "merge_kernel_size": getattr(config, "image_merge_kernel_size", None), "in_patch_limit": getattr(config, "image_in_patch_limit", None), - "patch_limit_on_one_side": getattr(config, "image_patch_limit_on_one_side", None), + "patch_limit_on_one_side": getattr( + config, "image_patch_limit_on_one_side", None + ), "fixed_output_tokens": getattr(config, "image_fixed_output_tokens", None), "image_mean": getattr(config, "image_mean", None), "image_std": getattr(config, "image_std", None), @@ -453,16 +455,20 @@ def kimi_image_layout_config_for_renderer(renderer: Any) -> KimiImageLayoutConfi "Kimi image layout must be declared on the renderer config; missing " + ", ".join(missing) ) + image_mean = cast("tuple[float, ...] | list[float]", values["image_mean"]) + image_std = cast("tuple[float, ...] | list[float]", values["image_std"]) return KimiImageLayoutConfig( patch_size=int(values["patch_size"]), merge_kernel_size=int(values["merge_kernel_size"]), in_patch_limit=int(values["in_patch_limit"]), patch_limit_on_one_side=int(values["patch_limit_on_one_side"]), fixed_output_tokens=( - None if values["fixed_output_tokens"] is None else int(values["fixed_output_tokens"]) + None + if values["fixed_output_tokens"] is None + else int(values["fixed_output_tokens"]) ), - image_mean=tuple(float(v) for v in values["image_mean"]), - image_std=tuple(float(v) for v in values["image_std"]), + image_mean=tuple(float(v) for v in image_mean), + image_std=tuple(float(v) for v in image_std), ) @@ -470,7 +476,9 @@ def _ceil_to_factor(value: int, factor: int) -> int: return max(factor, math.ceil(value / factor) * factor) -def _kimi_resize_config(width: int, height: int, layout: KimiImageLayoutConfig) -> tuple[int, int, int]: +def _kimi_resize_config( + width: int, height: int, layout: KimiImageLayoutConfig +) -> tuple[int, int, int]: """Kimi MoonViT/NavIT image resize layout without materializing pixels.""" if height <= 0 or width <= 0: raise ValueError(f"image dimensions must be positive, got {height}x{width}") @@ -478,10 +486,7 @@ def _kimi_resize_config(width: int, height: int, layout: KimiImageLayoutConfig) patch_limit_pixels = layout.patch_limit_on_one_side * patch_size s1 = math.sqrt( layout.in_patch_limit - / ( - max(1.0, width // patch_size) - * max(1.0, height // patch_size) - ) + / (max(1.0, width // patch_size) * max(1.0, height // patch_size)) ) s2 = patch_limit_pixels / width s3 = patch_limit_pixels / height @@ -499,7 +504,9 @@ def _kimi_resize_config(width: int, height: int, layout: KimiImageLayoutConfig) return padded_w, padded_h, int(num_tokens) -def describe_kimi_image_layout(renderer: Any, part: dict[str, Any]) -> KimiImageLayoutDescriptor: +def describe_kimi_image_layout( + renderer: Any, part: dict[str, Any] +) -> KimiImageLayoutDescriptor: source = _image_source(part) height, width = _image_dimensions(source) layout = kimi_image_layout_config_for_renderer(renderer) @@ -526,7 +533,9 @@ def describe_kimi_image_layout(renderer: Any, part: dict[str, Any]) -> KimiImage ) -def kimi_image_item_for_render(renderer: Any, part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: +def kimi_image_item_for_render( + renderer: Any, part: dict[str, Any] +) -> tuple[int, str, dict[str, Any]]: desc = describe_kimi_image_layout(renderer, part) item = raw_mm_item( modality="image", @@ -558,7 +567,9 @@ def _kimi_grids_equal(a: Any, b: Any) -> bool: return al == bl -def materialize_kimi_image_refs(renderer: Any, mm_data: MultiModalData, messages: list[Message]) -> MultiModalData: +def materialize_kimi_image_refs( + renderer: Any, mm_data: MultiModalData, messages: list[Message] +) -> MultiModalData: """Attach run-image refs to every Kimi image descriptor that can be found.""" from dataclasses import replace @@ -596,7 +607,9 @@ def materialize_kimi_image_refs(renderer: Any, mm_data: MultiModalData, messages for i, item in enumerate(image_items): desc = resolved[hashes[i]] if desc.raw_uri is None or desc.raw_image_id is None: - raise ValueError("materialize_kimi_image_refs requires file-backed image URLs") + raise ValueError( + "materialize_kimi_image_refs requires file-backed image URLs" + ) item_grid = _kimi_grid_from_item(item) if item_grid is not None and not _kimi_grids_equal(desc.grid_thws, item_grid): raise ValueError( diff --git a/renderers/mm_store.py b/renderers/mm_store.py index a27ea8a..827851b 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -38,7 +38,13 @@ _SAFE_IMAGE_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") _SAFE_REF_PAYLOAD_RE = re.compile(r"^[A-Za-z0-9_-]*$") -_MEDIA_TYPE_EXT = {"jpeg": ".jpg", "jpg": ".jpg", "png": ".png", "webp": ".webp", "gif": ".gif"} +_MEDIA_TYPE_EXT = { + "jpeg": ".jpg", + "jpg": ".jpg", + "png": ".png", + "webp": ".webp", + "gif": ".gif", +} def normalize_run_id(run_id: str) -> str: @@ -96,7 +102,9 @@ def run_dir(run_id: str | None = None) -> Path: value = run_id or os.getenv(RUN_ID_ENV, "").strip() if not value: - raise RuntimeError(f"Set {RUN_DIR_ENV} or {RUN_ID_ENV} before resolving a run directory.") + raise RuntimeError( + f"Set {RUN_DIR_ENV} or {RUN_ID_ENV} before resolving a run directory." + ) return (RUN_OUTPUT_ROOT / run_dir_name(value)).resolve() @@ -118,7 +126,9 @@ def _media_type_ext(media_type: str) -> str: return _MEDIA_TYPE_EXT.get(subtype, ".img") -def offload_image_to_run_assets(url: object, image_dir: Path | None = None) -> tuple[str, int] | None: +def offload_image_to_run_assets( + url: object, image_dir: Path | None = None +) -> tuple[str, int] | None: """Decode a base64 data image into the run image assets directory. Returns ``(file_url, byte_count)`` when ``url`` was rewritten and ``None`` @@ -170,7 +180,9 @@ def image_layout_fingerprint(*, family: str, **values: object) -> str: """Stable adapter-owned fingerprint for raw multimodal layout contracts.""" if not _SAFE_FAMILY_RE.fullmatch(family): raise ValueError(f"Invalid multimodal family: {family!r}") - encoded_values = ":".join(f"{key}={_json_fingerprint_value(values[key])}" for key in sorted(values)) + encoded_values = ":".join( + f"{key}={_json_fingerprint_value(values[key])}" for key in sorted(values) + ) raw = f"image-layout:v1:{family}:{encoded_values}".encode("utf-8") return hashlib.sha256(raw).hexdigest()[:32] @@ -254,9 +266,15 @@ def raw_mm_ref( if not _SAFE_MM_HASH_RE.fullmatch(mm_hash): raise ValueError(f"Invalid image hash: {mm_hash!r}") raw_image_path(run_id=run_id, raw_image_id=raw_image_id) - encoded_payload = base64.urlsafe_b64encode( - json.dumps(payload or {}, sort_keys=True, separators=(",", ":")).encode("utf-8") - ).decode("ascii").rstrip("=") + encoded_payload = ( + base64.urlsafe_b64encode( + json.dumps(payload or {}, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + ) + .decode("ascii") + .rstrip("=") + ) return ( f"{IMAGE_REF_PREFIX}:{run_id}:{family}:{fingerprint}:" f"{modality}:{mm_hash}:{raw_image_id}:{encoded_payload}" @@ -267,11 +285,15 @@ def split_raw_mm_ref(ref: str) -> RawMMRef: parts = ref.split(":") if parts[:2] != ["mmraw", "v2"] or len(parts) != 9: raise ValueError(f"Invalid raw multimodal ref shape: {ref!r}") - run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = parts[2:] + run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = ( + parts[2:] + ) if not _SAFE_REF_PAYLOAD_RE.fullmatch(encoded_payload): raise ValueError("Invalid raw multimodal ref payload segment") padded = encoded_payload + "=" * (-len(encoded_payload) % 4) - payload = json.loads(base64.urlsafe_b64decode(padded.encode("ascii")).decode("utf-8")) + payload = json.loads( + base64.urlsafe_b64decode(padded.encode("ascii")).decode("utf-8") + ) if not isinstance(payload, dict): raise ValueError("Raw multimodal ref payload must decode to a dict") return RawMMRef( diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 1cde900..4e51b9e 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -292,7 +292,9 @@ def _raw_uri_and_id(source: Any) -> tuple[str | None, str | None]: return path.as_uri(), path.name -def describe_qwen_image_layout(renderer: Any, part: dict[str, Any]) -> QwenImageLayoutDescriptor: +def describe_qwen_image_layout( + renderer: Any, part: dict[str, Any] +) -> QwenImageLayoutDescriptor: """Return Qwen image layout metadata without invoking an image processor.""" source = _image_source(part) height, width = _image_dimensions(source) @@ -307,7 +309,9 @@ def describe_qwen_image_layout(renderer: Any, part: dict[str, Any]) -> QwenImage grid_t = 1 grid_h = resized_h // layout.patch_size grid_w = resized_w // layout.patch_size - num_image_tokens = grid_t * grid_h * grid_w // (layout.merge_size * layout.merge_size) + num_image_tokens = ( + grid_t * grid_h * grid_w // (layout.merge_size * layout.merge_size) + ) fingerprint = image_layout_fingerprint( family="qwen_vl", patch_size=layout.patch_size, @@ -327,7 +331,9 @@ def describe_qwen_image_layout(renderer: Any, part: dict[str, Any]) -> QwenImage ) -def qwen_image_item_for_render(renderer: Any, part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: +def qwen_image_item_for_render( + renderer: Any, part: dict[str, Any] +) -> tuple[int, str, dict[str, Any]]: desc = describe_qwen_image_layout(renderer, part) item = raw_mm_item( modality="image", @@ -384,7 +390,9 @@ def _qwen_item_with_grid_and_ref( IMAGE_REF_PAYLOAD_KEY, } } - if new_item.get("family") == "qwen_vl" and isinstance(new_item.get("payload"), dict): + if new_item.get("family") == "qwen_vl" and isinstance( + new_item.get("payload"), dict + ): payload = dict(new_item["payload"]) payload["image_grid_thw"] = image_grid_thw new_item["payload"] = payload @@ -406,7 +414,9 @@ def _qwen_item_with_grid_and_ref( return new_item -def materialize_image_refs(renderer: Any, mm_data: MultiModalData, messages: list[Message]) -> MultiModalData: +def materialize_image_refs( + renderer: Any, mm_data: MultiModalData, messages: list[Message] +) -> MultiModalData: """Attach run-image refs to every Qwen image descriptor that can be found.""" image_items = mm_data.mm_items.get("image") or [] if not image_items: diff --git a/tests/test_client.py b/tests/test_client.py index ac0ec16..f14821e 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -2,6 +2,7 @@ import base64 import hashlib import json +from typing import Any import httpx import numpy as np @@ -10,17 +11,20 @@ ParsedResponse, ParsedToolCall, RenderedTokens, + ToolSpec, ToolCallParseStatus, ) from renderers.client import generate +_OPENAI_TOOL: Any = {"type": "function", "function": {"name": "echo"}} + class _FakeRenderer: supports_tools = True def render(self, messages, *, tools=None, add_generation_prompt=False): assert messages == [{"role": "user", "content": "hi"}] - assert tools == [{"type": "function", "function": {"name": "echo"}}] + assert tools == [_OPENAI_TOOL] assert add_generation_prompt is True # Populate the full attribution surface so the test can verify # ``generate`` threads it through to the result dict unchanged. @@ -161,12 +165,15 @@ def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): [ { "role": "user", - "content": [{"type": "image_url", "image_url": {"url": image_path.as_uri()}}], + "content": [ + {"type": "image_url", "image_url": {"url": image_path.as_uri()}} + ], } ], add_generation_prompt=True, ) + assert rendered.multi_modal_data is not None item = rendered.multi_modal_data.mm_items["image"][0] assert "pixel_values" not in item assert item["family"] == "qwen_vl" @@ -176,7 +183,9 @@ def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): assert rendered.multi_modal_data.mm_placeholders["image"][0].length == 64 -def test_generate_materialize_all_image_refs_rehydrates_descriptor_slots(tmp_path, monkeypatch): +def test_generate_materialize_all_image_refs_rehydrates_descriptor_slots( + tmp_path, monkeypatch +): pytest.importorskip("PIL") from PIL import Image @@ -190,7 +199,10 @@ class _RetryRenderer(Qwen3VLRenderer): def get_stop_token_ids(self): return [99] - def parse_response(self, completion_ids, *, tools=None): + def parse_response( + self, token_ids: list[int], *, tools: list[ToolSpec] | None = None + ) -> ParsedResponse: + assert token_ids == [7, 8] return ParsedResponse(content="done") image_dir = tmp_path / "run_retry" / "assets" / "images" @@ -216,7 +228,9 @@ def parse_response(self, completion_ids, *, tools=None): messages=[ { "role": "user", - "content": [{"type": "image_url", "image_url": {"url": image_path.as_uri()}}], + "content": [ + {"type": "image_url", "image_url": {"url": image_path.as_uri()}} + ], } ], model="qwen3-vl", @@ -248,7 +262,7 @@ def test_generate_builds_request_body_and_parses_response(): renderer=renderer, messages=[{"role": "user", "content": "hi"}], model="test-model", - tools=[{"type": "function", "function": {"name": "echo"}}], + tools=[_OPENAI_TOOL], sampling_params={"temperature": 0.3, "max_tokens": 7, "min_tokens": 2}, cache_salt="ckpt-42", ) @@ -256,9 +270,7 @@ def test_generate_builds_request_body_and_parses_response(): # The client must plumb `tools` through to parse_response so XML-style # parsers can preserve declared-string args verbatim. - assert renderer._last_parse_tools == [ - {"type": "function", "function": {"name": "echo"}} - ] + assert renderer._last_parse_tools == [_OPENAI_TOOL] assert len(client.calls) == 1 # /inference/v1/generate is mounted at the server root, so we post to @@ -341,7 +353,7 @@ def test_generate_does_not_promote_finish_reason_for_malformed_tool_calls(): renderer=_MalformedToolRenderer(), messages=[{"role": "user", "content": "hi"}], model="test-model", - tools=[{"type": "function", "function": {"name": "echo"}}], + tools=[_OPENAI_TOOL], ) ) assert result["finish_reason"] == "stop" @@ -524,7 +536,14 @@ def parse_response(self, completion_ids, *, tools=None): assert items[1] is None ref = split_raw_mm_ref(items[0]) assert ref.payload == {"image_grid_thw": [[1, 2, 2]]} - assert (ref.run_id, ref.family, ref.fingerprint, ref.modality, ref.mm_hash, ref.raw_image_id) == ( + assert ( + ref.run_id, + ref.family, + ref.fingerprint, + ref.modality, + ref.mm_hash, + ref.raw_image_id, + ) == ( "rawtest", "qwen_vl", fingerprint, From b5167c938ce2d9f618f925fcf6d72f0979b0d56f Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 27 Jun 2026 00:18:40 +0000 Subject: [PATCH 04/16] Simplify v1 raw multimodal: carry the image pointer at every slot Drop the cache-only None path. Every image (current and prior turns) carries its raw descriptor ref; _descriptor_only_mm_data no longer strips the pointer, so refs carry forward without a rebuild. Removes the now-orphaned materialize_image_refs / materialize_kimi_image_refs and the materialize_all_image_refs flag. Co-Authored-By: Claude Opus 4.8 (1M context) --- renderers/base.py | 3 -- renderers/client.py | 36 ++-------------------- renderers/kimi_k25.py | 71 ------------------------------------------- renderers/qwen35.py | 5 --- renderers/qwen3_vl.py | 55 --------------------------------- tests/test_client.py | 67 ---------------------------------------- 6 files changed, 3 insertions(+), 234 deletions(-) diff --git a/renderers/base.py b/renderers/base.py index b11a328..e449f67 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -966,9 +966,6 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No with self.checkout() as r: return r.bridge_to_next_turn(*args, **kwargs) - def materialize_image_refs(self, *args: Any, **kwargs: Any) -> "MultiModalData": - with self.checkout() as r: - return r.materialize_image_refs(*args, **kwargs) # ``mm_token_type_id_map`` (the MultimodalRenderer protocol attribute) # is set in ``__init__`` only for pools wrapping multimodal renderers; diff --git a/renderers/client.py b/renderers/client.py index 25ab763..afc67f8 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -157,7 +157,6 @@ async def generate( priority: int | None = None, extra_headers: dict[str, str] | None = None, max_prompt_len: int | None = None, - materialize_all_image_refs: bool = False, ) -> dict[str, Any]: """Tokenize messages, call vLLM /inference/v1/generate, parse the response. @@ -257,17 +256,9 @@ def _features_and_descriptor_mm() -> tuple[ ]: if mm_data is None or mm_data.is_empty(): return None, mm_data - build_mm = mm_data - if materialize_all_image_refs: - materialize = getattr(renderer, "materialize_image_refs", None) - if materialize is None: - raise NotImplementedError( - f"{type(renderer).__name__} cannot materialize image refs for retry." - ) - build_mm = materialize(mm_data, messages) - return _build_vllm_mm_features(renderer, build_mm), _descriptor_only_mm_data( - mm_data - ) + # Every image carries its raw ref (the pointer); persisted mm_data keeps it + # so prior-turn images carry forward without a cache-only/None path. + return _build_vllm_mm_features(renderer, mm_data), mm_data features, out_mm_data = await _maybe_offload(renderer, _features_and_descriptor_mm) if ( @@ -356,27 +347,6 @@ def _features_and_descriptor_mm() -> tuple[ } -def _descriptor_only_mm_data(mm_data: MultiModalData) -> MultiModalData: - """Drop one-request image-ref fields before callers persist mm_data.""" - from renderers.mm_store import IMAGE_REF_PAYLOAD_KEY - - new_items: dict[str, list[dict[str, Any]]] = {} - for modality, items in mm_data.mm_items.items(): - new_items[modality] = [ - { - key: value - for key, value in item.items() - if key - not in { - "pixel_values", - "raw_uri", - "raw_image_id", - IMAGE_REF_PAYLOAD_KEY, - } - } - for item in items - ] - return replace(mm_data, mm_items=new_items) def _build_vllm_mm_features( diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index b2cc234..6c27e5a 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -567,73 +567,6 @@ def _kimi_grids_equal(a: Any, b: Any) -> bool: return al == bl -def materialize_kimi_image_refs( - renderer: Any, mm_data: MultiModalData, messages: list[Message] -) -> MultiModalData: - """Attach run-image refs to every Kimi image descriptor that can be found.""" - from dataclasses import replace - - image_items = mm_data.mm_items.get("image") or [] - if not image_items: - return mm_data - hashes = mm_data.mm_hashes.get("image") or [] - if len(hashes) != len(image_items): - raise ValueError( - "materialize_kimi_image_refs: mm_hashes/mm_items length mismatch " - f"({len(hashes)} vs {len(image_items)})" - ) - - missing = set(hashes) - resolved: dict[str, KimiImageLayoutDescriptor] = {} - for msg in messages or []: - content = msg.get("content") if isinstance(msg, dict) else None - if not isinstance(content, list): - continue - for part in content: - if not missing: - break - if not (isinstance(part, dict) and _is_image_part(part)): - continue - desc = describe_kimi_image_layout(renderer, part) - if desc.mm_hash in missing: - resolved[desc.mm_hash] = desc - missing.discard(desc.mm_hash) - if missing: - raise ValueError( - f"materialize_kimi_image_refs: {len(missing)} image hash(es) not found in messages" - ) - - new_image_items: list[dict[str, Any]] = [] - for i, item in enumerate(image_items): - desc = resolved[hashes[i]] - if desc.raw_uri is None or desc.raw_image_id is None: - raise ValueError( - "materialize_kimi_image_refs requires file-backed image URLs" - ) - item_grid = _kimi_grid_from_item(item) - if item_grid is not None and not _kimi_grids_equal(desc.grid_thws, item_grid): - raise ValueError( - "materialize_kimi_image_refs: reconstructed grid_thws " - f"{desc.grid_thws!r} != descriptor {item_grid!r}" - ) - new_image_items.append( - raw_mm_item( - modality="image", - family=KIMI_K25_FAMILY, - layout_fingerprint=desc.fingerprint, - payload={ - "grid_thws": item_grid if item_grid is not None else desc.grid_thws, - "num_media_tokens": desc.num_media_tokens, - }, - raw_uri=desc.raw_uri, - raw_image_id=desc.raw_image_id, - vllm_modality=KIMI_K25_VLLM_MODALITY, - ) - ) - - new_items = dict(mm_data.mm_items) - new_items["image"] = new_image_items - return replace(mm_data, mm_items=new_items) # --------------------------------------------------------------------------- @@ -882,10 +815,6 @@ def mm_token_type_id_map(self) -> dict[int, int]: internally from ``pixel_values``.""" return {self._media_pad: 1} - def materialize_image_refs( - self, mm_data: MultiModalData, messages: list[Message] - ) -> MultiModalData: - return materialize_kimi_image_refs(self, mm_data, messages) def _get_processor(self): if self._processor is not None: diff --git a/renderers/qwen35.py b/renderers/qwen35.py index c0b76d6..fd15393 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -37,7 +37,6 @@ from renderers.qwen3_vl import ( _is_image_part, _is_video_part, - materialize_image_refs, qwen_image_item_for_render, ) @@ -155,10 +154,6 @@ def mm_token_type_id_map(self) -> dict[int, int]: """ return {self._image_pad: 1, self._video_pad: 2} - def materialize_image_refs( - self, mm_data: MultiModalData, messages: list[Message] - ) -> MultiModalData: - return materialize_image_refs(self, mm_data, messages) @staticmethod def _content_has_media(content: Any) -> bool: diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 4e51b9e..4090802 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -414,57 +414,6 @@ def _qwen_item_with_grid_and_ref( return new_item -def materialize_image_refs( - renderer: Any, mm_data: MultiModalData, messages: list[Message] -) -> MultiModalData: - """Attach run-image refs to every Qwen image descriptor that can be found.""" - image_items = mm_data.mm_items.get("image") or [] - if not image_items: - return mm_data - hashes = mm_data.mm_hashes.get("image") or [] - if len(hashes) != len(image_items): - raise ValueError( - "materialize_image_refs: mm_hashes/mm_items length mismatch " - f"({len(hashes)} vs {len(image_items)})" - ) - - missing = set(hashes) - resolved: dict[str, QwenImageLayoutDescriptor] = {} - for part in _iter_image_parts(messages): - if not missing: - break - desc = describe_qwen_image_layout(renderer, part) - if desc.mm_hash in missing: - resolved[desc.mm_hash] = desc - missing.discard(desc.mm_hash) - if missing: - raise ValueError( - f"materialize_image_refs: {len(missing)} image hash(es) not found in messages" - ) - - new_image_items: list[dict[str, Any]] = [] - for i, item in enumerate(image_items): - desc = resolved[hashes[i]] - if desc.raw_uri is None or desc.raw_image_id is None: - raise ValueError("materialize_image_refs requires file-backed image URLs") - item_grid = _qwen_grid_from_item(item) - if item_grid is not None and not _grids_equal(desc.image_grid_thw, item_grid): - raise ValueError( - "materialize_image_refs: reconstructed image_grid_thw " - f"{desc.image_grid_thw!r} != descriptor {item_grid!r}" - ) - new_item = _qwen_item_with_grid_and_ref( - item, - image_grid_thw=item_grid if item_grid is not None else desc.image_grid_thw, - fingerprint=desc.fingerprint, - raw_uri=desc.raw_uri, - raw_image_id=desc.raw_image_id, - ) - new_image_items.append(new_item) - - new_items = dict(mm_data.mm_items) - new_items["image"] = new_image_items - return replace(mm_data, mm_items=new_items) class _Emitter: @@ -686,10 +635,6 @@ def _render_text_content(content: Any) -> str: return "".join(parts) raise TypeError(f"Unexpected content type: {type(content)}") - def materialize_image_refs( - self, mm_data: MultiModalData, messages: list[Message] - ) -> MultiModalData: - return materialize_image_refs(self, mm_data, messages) def render( self, diff --git a/tests/test_client.py b/tests/test_client.py index f14821e..3a2d56d 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -183,73 +183,6 @@ def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): assert rendered.multi_modal_data.mm_placeholders["image"][0].length == 64 -def test_generate_materialize_all_image_refs_rehydrates_descriptor_slots( - tmp_path, monkeypatch -): - pytest.importorskip("PIL") - from PIL import Image - - from renderers.base import MultiModalData, ParsedResponse, PlaceholderRange - from renderers.mm_store import split_raw_mm_ref - from renderers.qwen3_vl import Qwen3VLRenderer - - class _RetryRenderer(Qwen3VLRenderer): - supports_tools = True - - def get_stop_token_ids(self): - return [99] - - def parse_response( - self, token_ids: list[int], *, tools: list[ToolSpec] | None = None - ) -> ParsedResponse: - assert token_ids == [7, 8] - return ParsedResponse(content="done") - - image_dir = tmp_path / "run_retry" / "assets" / "images" - image_dir.mkdir(parents=True) - image_path = image_dir / "image.png" - Image.new("RGB", (32, 32), color=(0, 255, 0)).save(image_path) - monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) - monkeypatch.setenv("RUN_ID", "retry") - - mm_hash = hashlib.sha256(image_path.read_bytes()).hexdigest()[:32] - mm_data = MultiModalData( - mm_hashes={"image": [mm_hash]}, - mm_placeholders={"image": [PlaceholderRange(offset=5, length=64)]}, - mm_items={"image": [{"image_grid_thw": [[1, 16, 16]]}]}, - ) - renderer = _RetryRenderer(_TinyQwenTokenizer()) - client = _FakeClient() - - asyncio.run( - generate( - client=client, - renderer=renderer, - messages=[ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_path.as_uri()}} - ], - } - ], - model="qwen3-vl", - prompt_ids=list(range(20)), - multi_modal_data=mm_data, - sampling_params={"max_tokens": 4}, - materialize_all_image_refs=True, - ) - ) - - ref_item = client.calls[0]["body"]["features"]["kwargs_data"]["image"][0] - ref = split_raw_mm_ref(ref_item) - assert ref.payload["image_grid_thw"] == [[1, 16, 16]] - assert (ref.run_id, ref.modality, ref.mm_hash, ref.raw_image_id) == ( - "retry", - "image", - mm_hash, - "image.png", - ) def test_generate_builds_request_body_and_parses_response(): From b404f80c6f2939f8bf4aa1209e8e6d93241be1c4 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 27 Jun 2026 01:47:39 +0000 Subject: [PATCH 05/16] Renderers cleanup: drop dead render-time processor arg, backcompat, stale comments - Drop the render-time processor constructor arg from Qwen3VL/Qwen35/Kimi renderers: geometry is computed deterministically from config; no renderer runs the HF image processor at render. Remove Kimi dead _get_processor/_process_image/self._processor/_image_cache. - mm_store: remove all backcompat aliases (MMRAW_PREFIX, MM_RAW_PAYLOAD_KEY/VALUE, mmraw_ref, split_mmraw_ref, image_asset_dir) -- no consumers. - client.py: fix stale generate() docstring + comment that referenced the removed None/cache path. Co-Authored-By: Claude Opus 4.8 (1M context) --- renderers/client.py | 8 +++---- renderers/kimi_k25.py | 55 ------------------------------------------- renderers/mm_store.py | 10 -------- renderers/qwen35.py | 3 --- renderers/qwen3_vl.py | 6 ----- 5 files changed, 4 insertions(+), 78 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index afc67f8..17a90b2 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -175,8 +175,8 @@ async def generate( through ``renderer.render(...)`` to recover the ``multi_modal_data`` sidecar, then serializes it to vLLM's ``features`` schema (mm_hashes, mm_placeholders, kwargs_data) before POSTing. Raw image ``kwargs_data`` - slots are either ``None`` (cache lookup for a prior image) or descriptor - refs (new/current images that vLLM should process). + slots always carry a descriptor ref — every image (current and prior + turns) is sent as a pointer that the inference endpoint materializes. ``max_prompt_len`` controls the pre-flight overflow check. When the rendered prompt is strictly longer than the cap, the request is never @@ -256,8 +256,8 @@ def _features_and_descriptor_mm() -> tuple[ ]: if mm_data is None or mm_data.is_empty(): return None, mm_data - # Every image carries its raw ref (the pointer); persisted mm_data keeps it - # so prior-turn images carry forward without a cache-only/None path. + # Every image carries its raw ref (the pointer); persisted mm_data keeps it, + # so prior-turn images carry their ref forward unchanged. return _build_vllm_mm_features(renderer, mm_data), mm_data features, out_mm_data = await _maybe_offload(renderer, _features_and_descriptor_mm) diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 6c27e5a..c29473f 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -761,11 +761,8 @@ def __init__( self, tokenizer: PreTrainedTokenizer, config: KimiK25RendererConfig | None = None, - *, - processor: Any = None, ): self._tokenizer = tokenizer - self._processor = processor self.config = config or KimiK25RendererConfig() # Core structural tokens — all must be single special tokens in the vocab @@ -801,12 +798,6 @@ def __init__( # The stop token for generation self._endoftext: int | None = self._try_token_id("<|endoftext|>") - # Per-instance image-processor cache (FIFO-bounded). Same shape as - # ``Qwen3VLRenderer._image_cache`` — keyed by content hash, value is - # ``(processor_out, num_patches)``. ``num_patches`` is informational - # for Kimi (we emit a single placeholder regardless), but kept for - # consistency / debugging. - self._image_cache: dict[str, tuple[Any, int]] = {} @property def mm_token_type_id_map(self) -> dict[int, int]: @@ -816,53 +807,7 @@ def mm_token_type_id_map(self) -> dict[int, int]: return {self._media_pad: 1} - def _get_processor(self): - if self._processor is not None: - return self._processor - from transformers import AutoProcessor - name = getattr(self._tokenizer, "name_or_path", None) - if not name: - raise RuntimeError( - "KimiK25Renderer needs a processor to render image content. " - "Pass `processor=AutoProcessor.from_pretrained(name, trust_remote_code=True, " - "revision=)` to the constructor, or load the tokenizer with a " - "known name_or_path so the processor can be auto-loaded." - ) - # Kimi's processor is custom Python in the model repo and requires - # trust_remote_code=True. Callers using ``create_renderer_pool`` go - # through ``load_tokenizer`` which already pins the revision; for - # auto-load here, we delegate to AutoProcessor with the same flag. - self._processor = AutoProcessor.from_pretrained(name, trust_remote_code=True) - return self._processor - - def _process_image(self, part: dict[str, Any]): - """Resolve, process, and characterize a single image part for Kimi K2.5. - - Returns ``(pil, processor_out, num_patches, image_hash)`` where - ``processor_out`` contains ``pixel_values`` and ``grid_thws`` - (Kimi's keys; differ from Qwen-VL's ``image_grid_thw``). Single - ``<|media_pad|>`` per image in the token stream; the patch count - is informational only. - """ - pil = _load_pil_image(part) - h = _image_hash(pil) - cached = self._image_cache.get(h) - if cached is not None: - out, num_patches = cached - return pil, out, num_patches, h - proc = self._get_processor() - img_proc = proc.image_processor - # Kimi's vision processor takes a media-dict shape, not raw PIL. - media_item = {"type": "image", "image": pil} - out = img_proc.preprocess([media_item], return_tensors="np") - # Patch count via the processor's own calculator (matches the - # model's per-patch attention count); kept for debugging. - num_patches = int(img_proc.media_tokens_calculator(media_item)) - if len(self._image_cache) >= self.config.image_cache_max: - self._image_cache.pop(next(iter(self._image_cache))) - self._image_cache[h] = (out, num_patches) - return pil, out, num_patches, h # ------------------------------------------------------------------ # Token helpers diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 827851b..1355b96 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -116,9 +116,6 @@ def run_image_dir(run_id: str | None = None) -> Path: return (run_dir(run_id) / IMAGE_ASSET_SUBDIR).resolve() -def image_asset_dir(run_id: str | None = None) -> Path: - """Alias for callers that already use the assets terminology.""" - return run_image_dir(run_id) def _media_type_ext(media_type: str) -> str: @@ -306,10 +303,3 @@ def split_raw_mm_ref(ref: str) -> RawMMRef: payload=payload, ) - -# Backwards-compatible names for consumers that already speak the mmraw wire format. -MMRAW_PREFIX = IMAGE_REF_PREFIX -MM_RAW_PAYLOAD_KEY = IMAGE_REF_PAYLOAD_KEY -MM_RAW_PAYLOAD_VALUE = IMAGE_REF_PAYLOAD_VALUE -mmraw_ref = raw_mm_ref -split_mmraw_ref = split_raw_mm_ref diff --git a/renderers/qwen35.py b/renderers/qwen35.py index fd15393..906c6cb 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -115,11 +115,8 @@ def __init__( self, tokenizer: PreTrainedTokenizer, config: Qwen35RendererConfig | None = None, - *, - processor: Any = None, ): self._tokenizer = tokenizer - _ = processor cfg = config or type(self)._config_cls() # ``enable_thinking=None`` defers to the model's known default (see # ``_ENABLE_THINKING_DEFAULTS``). Materialise here so downstream reads diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 4090802..41ff581 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -549,9 +549,6 @@ class Qwen3VLRenderer: config: Typed renderer config (see :class:`renderers.Qwen3VLRendererConfig`). Defaults to a blank config with template defaults. - processor: Deprecated and ignored. Image layout is declared by the - renderer config; the renderer never loads or calls an HF image - processor. ``preserve_all_thinking`` / ``preserve_thinking_between_tool_calls`` on the config are no-ops here — the chat template drops past @@ -562,11 +559,8 @@ def __init__( self, tokenizer: PreTrainedTokenizer, config: Qwen3VLRendererConfig | None = None, - *, - processor: Any = None, ): self._tokenizer = tokenizer - _ = processor self.config = config or Qwen3VLRendererConfig() self._im_start = self._token_id("<|im_start|>") From f8ca35415a6e6d89a6cee6d9e9127429bb9d68b6 Mon Sep 17 00:00:00 2001 From: S1ro1 Date: Sat, 27 Jun 2026 01:54:51 +0000 Subject: [PATCH 06/16] Remove orphaned image_cache_max config field It only sized Kimi per-renderer image cache, which was deleted with the render-time processor path. No consumers. Co-Authored-By: Claude Opus 4.8 (1M context) --- renderers/configs.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/renderers/configs.py b/renderers/configs.py index 54ac342..af0644f 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -74,7 +74,7 @@ class BaseRendererConfig(BaseConfig): # Fields that are renderer-internal — not forwarded to (or mirrored # by) ``apply_chat_template``. Override in subclasses that hold - # non-template config (e.g. ``image_cache_max``, GptOss's + # non-template config (e.g. GptOss's # ``use_system_prompt`` / ``knowledge_cutoff`` / ``model_identity``, # or fields that exist as renderer conventions without a Jinja # analogue like DeepSeek V3 / Kimi K2 ``enable_thinking``). @@ -367,9 +367,6 @@ class KimiK25RendererConfig(BaseRendererConfig): ``thinking`` (not ``enable_thinking``) to match the upstream chat template's native variable name.""" - image_cache_max: int = 256 - """FIFO bound on Kimi's per-renderer image processor cache.""" - image_patch_size: int = KIMI_K25_IMAGE_PATCH_SIZE """Kimi MoonViT patch size used to compute raw image layout descriptors.""" @@ -393,7 +390,6 @@ class KimiK25RendererConfig(BaseRendererConfig): _internal_fields = frozenset( { - "image_cache_max", "image_patch_size", "image_merge_kernel_size", "image_in_patch_limit", From c33805db767780f5ea3c8268e442d93945405969 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Sat, 27 Jun 2026 22:48:28 +0000 Subject: [PATCH 07/16] Align raw multimodal renderer descriptors --- renderers/client.py | 59 ++++----- renderers/configs.py | 122 ------------------ renderers/image_layout_specs.py | 33 +++++ renderers/kimi_k25.py | 91 ++------------ renderers/mm_store.py | 104 ++++++++------- renderers/qwen35.py | 8 +- renderers/qwen3_vl.py | 124 ++---------------- tests/test_client.py | 28 ++++- tests/test_multimodal_image_layout_parity.py | 125 +++++++++++++++++++ 9 files changed, 278 insertions(+), 416 deletions(-) create mode 100644 renderers/image_layout_specs.py create mode 100644 tests/test_multimodal_image_layout_parity.py diff --git a/renderers/client.py b/renderers/client.py index 17a90b2..3465536 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -258,7 +258,7 @@ def _features_and_descriptor_mm() -> tuple[ return None, mm_data # Every image carries its raw ref (the pointer); persisted mm_data keeps it, # so prior-turn images carry their ref forward unchanged. - return _build_vllm_mm_features(renderer, mm_data), mm_data + return _build_vllm_mm_features(mm_data), mm_data features, out_mm_data = await _maybe_offload(renderer, _features_and_descriptor_mm) if ( @@ -349,22 +349,16 @@ def _features_and_descriptor_mm() -> tuple[ -def _build_vllm_mm_features( - renderer: Renderer | RendererPool, - mm_data: MultiModalData, -) -> dict[str, Any] | None: +def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: """Serialize ``MultiModalData`` to vLLM's ``/inference/v1/generate`` features payload. - vLLM's ``MultiModalFeatures`` carries three things: hashes (for cache - lookup), placeholder positions (so the engine knows where in the token - stream each item lives), and per-item payload selectors. Raw multimodal - descriptors use the common envelope emitted by renderers; family-specific - geometry stays inside the descriptor payload and is interpreted downstream - by prime-rl/vLLM adapters. + vLLM's ``MultiModalFeatures`` carries three things: hashes, placeholder + positions (so the engine knows where in the token stream each item lives), + and one raw ref per item. Raw multimodal descriptors use the common envelope + emitted by renderers; family-specific geometry stays inside the descriptor + payload and is interpreted downstream by prime-rl/vLLM adapters. """ from renderers.mm_store import ( - IMAGE_REF_PAYLOAD_KEY, - IMAGE_REF_PAYLOAD_VALUE, RAW_MM_ITEM_KIND, current_run_id, raw_mm_ref, @@ -398,35 +392,34 @@ def _build_vllm_mm_features( feature_modality = item.get("vllm_modality") or source_modality if not isinstance(feature_modality, str) or not feature_modality: raise ValueError("raw multimodal item has invalid vllm_modality") - out["mm_hashes"].setdefault(feature_modality, []).append(mm_hashes[idx]) - out["mm_placeholders"].setdefault(feature_modality, []).append( - {"offset": placeholders[idx].offset, "length": placeholders[idx].length} - ) - out["kwargs_data"].setdefault(feature_modality, []).append(None) - if item.get(IMAGE_REF_PAYLOAD_KEY) != IMAGE_REF_PAYLOAD_VALUE: - continue + raw_image_id = item.get("raw_image_id") family = item.get("family") fingerprint = item.get("layout_fingerprint") + payload = item.get("payload") if not isinstance(raw_image_id, str) or not raw_image_id: raise ValueError("raw multimodal item is missing raw_image_id") if not isinstance(family, str) or not family: raise ValueError("raw multimodal item is missing family") if not isinstance(fingerprint, str) or not fingerprint: raise ValueError("raw multimodal item is missing layout_fingerprint") - out["kwargs_data"][feature_modality][-1] = raw_mm_ref( - run_id=run_id, - family=family, - fingerprint=fingerprint, - modality=feature_modality, - mm_hash=mm_hashes[idx], - raw_image_id=raw_image_id, - payload=item.get("payload") or {}, - ) + if not isinstance(payload, dict): + raise ValueError("raw multimodal item payload must be a dict") - if not any( - item is not None for values in out["kwargs_data"].values() for item in values - ): - out["kwargs_data"] = None + out["mm_hashes"].setdefault(feature_modality, []).append(mm_hashes[idx]) + out["mm_placeholders"].setdefault(feature_modality, []).append( + {"offset": placeholders[idx].offset, "length": placeholders[idx].length} + ) + out["kwargs_data"].setdefault(feature_modality, []).append( + raw_mm_ref( + run_id=run_id, + family=family, + fingerprint=fingerprint, + modality=feature_modality, + mm_hash=mm_hashes[idx], + raw_image_id=raw_image_id, + payload=payload, + ) + ) return out diff --git a/renderers/configs.py b/renderers/configs.py index af0644f..f51cb81 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -25,20 +25,6 @@ from pydantic import ConfigDict, Field from pydantic_config import BaseConfig -QWEN_VL_IMAGE_PATCH_SIZE = 16 -QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE = 2 -QWEN_VL_IMAGE_MERGE_SIZE = 2 -QWEN_VL_IMAGE_MIN_PIXELS = 65536 -QWEN_VL_IMAGE_MAX_PIXELS = 16777216 - -KIMI_K25_IMAGE_PATCH_SIZE = 14 -KIMI_K25_IMAGE_MERGE_KERNEL_SIZE = 2 -KIMI_K25_IMAGE_IN_PATCH_LIMIT = 16384 -KIMI_K25_IMAGE_PATCH_LIMIT_ON_ONE_SIDE = 512 -KIMI_K25_IMAGE_FIXED_OUTPUT_TOKENS: int | None = None -KIMI_K25_IMAGE_MEAN = (0.5, 0.5, 0.5) -KIMI_K25_IMAGE_STD = (0.5, 0.5, 0.5) - class BaseRendererConfig(BaseConfig): """Shared fields and config for every renderer config variant. @@ -162,31 +148,6 @@ class Qwen35RendererConfig(BaseRendererConfig): running across the entire conversation. Mirrors the chat template's ``add_vision_id`` toggle.""" - image_patch_size: int = QWEN_VL_IMAGE_PATCH_SIZE - """Qwen image patch size used to compute placeholder layout.""" - - image_temporal_patch_size: int = QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE - """Qwen temporal patch size used in the image layout fingerprint.""" - - image_merge_size: int = QWEN_VL_IMAGE_MERGE_SIZE - """Qwen spatial merge size used to compute image pad-token counts.""" - - image_min_pixels: int = QWEN_VL_IMAGE_MIN_PIXELS - """Minimum resized image area used by Qwen smart-resize layout math.""" - - image_max_pixels: int = QWEN_VL_IMAGE_MAX_PIXELS - """Maximum resized image area used by Qwen smart-resize layout math.""" - - _internal_fields = frozenset( - { - "image_patch_size", - "image_temporal_patch_size", - "image_merge_size", - "image_min_pixels", - "image_max_pixels", - } - ) - class Qwen36RendererConfig(BaseRendererConfig): """Qwen3.6 renderer config. Inherits Qwen3.5's template surface.""" @@ -199,31 +160,6 @@ class Qwen36RendererConfig(BaseRendererConfig): add_vision_id: bool = False """See :class:`Qwen35RendererConfig.add_vision_id`.""" - image_patch_size: int = QWEN_VL_IMAGE_PATCH_SIZE - """See :class:`Qwen35RendererConfig.image_patch_size`.""" - - image_temporal_patch_size: int = QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE - """See :class:`Qwen35RendererConfig.image_temporal_patch_size`.""" - - image_merge_size: int = QWEN_VL_IMAGE_MERGE_SIZE - """See :class:`Qwen35RendererConfig.image_merge_size`.""" - - image_min_pixels: int = QWEN_VL_IMAGE_MIN_PIXELS - """See :class:`Qwen35RendererConfig.image_min_pixels`.""" - - image_max_pixels: int = QWEN_VL_IMAGE_MAX_PIXELS - """See :class:`Qwen35RendererConfig.image_max_pixels`.""" - - _internal_fields = frozenset( - { - "image_patch_size", - "image_temporal_patch_size", - "image_merge_size", - "image_min_pixels", - "image_max_pixels", - } - ) - class Qwen3VLRendererConfig(BaseRendererConfig): """Qwen3-VL renderer config.""" @@ -233,31 +169,6 @@ class Qwen3VLRendererConfig(BaseRendererConfig): add_vision_id: bool = False """See :class:`Qwen35RendererConfig.add_vision_id`.""" - image_patch_size: int = QWEN_VL_IMAGE_PATCH_SIZE - """See :class:`Qwen35RendererConfig.image_patch_size`.""" - - image_temporal_patch_size: int = QWEN_VL_IMAGE_TEMPORAL_PATCH_SIZE - """See :class:`Qwen35RendererConfig.image_temporal_patch_size`.""" - - image_merge_size: int = QWEN_VL_IMAGE_MERGE_SIZE - """See :class:`Qwen35RendererConfig.image_merge_size`.""" - - image_min_pixels: int = QWEN_VL_IMAGE_MIN_PIXELS - """See :class:`Qwen35RendererConfig.image_min_pixels`.""" - - image_max_pixels: int = QWEN_VL_IMAGE_MAX_PIXELS - """See :class:`Qwen35RendererConfig.image_max_pixels`.""" - - _internal_fields = frozenset( - { - "image_patch_size", - "image_temporal_patch_size", - "image_merge_size", - "image_min_pixels", - "image_max_pixels", - } - ) - class GLM5RendererConfig(BaseRendererConfig): """GLM-5 renderer config.""" @@ -367,39 +278,6 @@ class KimiK25RendererConfig(BaseRendererConfig): ``thinking`` (not ``enable_thinking``) to match the upstream chat template's native variable name.""" - image_patch_size: int = KIMI_K25_IMAGE_PATCH_SIZE - """Kimi MoonViT patch size used to compute raw image layout descriptors.""" - - image_merge_kernel_size: int = KIMI_K25_IMAGE_MERGE_KERNEL_SIZE - """Kimi spatial merge kernel used to compute output media-token layout.""" - - image_in_patch_limit: int = KIMI_K25_IMAGE_IN_PATCH_LIMIT - """Kimi NavIT input patch budget used by image resize layout math.""" - - image_patch_limit_on_one_side: int = KIMI_K25_IMAGE_PATCH_LIMIT_ON_ONE_SIDE - """Kimi per-side patch cap used by image resize layout math.""" - - image_fixed_output_tokens: int | None = KIMI_K25_IMAGE_FIXED_OUTPUT_TOKENS - """Optional fixed Kimi output token count. Current K2.5/K2.6 configs use ``None``.""" - - image_mean: tuple[float, float, float] = KIMI_K25_IMAGE_MEAN - """Kimi image normalization mean, included in processor fingerprints.""" - - image_std: tuple[float, float, float] = KIMI_K25_IMAGE_STD - """Kimi image normalization std, included in processor fingerprints.""" - - _internal_fields = frozenset( - { - "image_patch_size", - "image_merge_kernel_size", - "image_in_patch_limit", - "image_patch_limit_on_one_side", - "image_fixed_output_tokens", - "image_mean", - "image_std", - } - ) - class LagunaXS2RendererConfig(BaseRendererConfig): """Laguna XS.2 renderer config.""" diff --git a/renderers/image_layout_specs.py b/renderers/image_layout_specs.py new file mode 100644 index 0000000..c233eea --- /dev/null +++ b/renderers/image_layout_specs.py @@ -0,0 +1,33 @@ +"""Static multimodal image layout contracts mirrored from model processors.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class QwenVLImageLayoutSpec: + """Qwen-VL image processor values needed for raw descriptor layout math.""" + + patch_size: int = 16 + temporal_patch_size: int = 2 + merge_size: int = 2 + min_pixels: int = 65536 + max_pixels: int = 16777216 + + +@dataclass(frozen=True) +class KimiK25ImageLayoutSpec: + """Kimi K2.5 image processor values needed for raw descriptor layout math.""" + + patch_size: int = 14 + merge_kernel_size: int = 2 + in_patch_limit: int = 16384 + patch_limit_on_one_side: int = 512 + fixed_output_tokens: int | None = None + image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5) + image_std: tuple[float, float, float] = (0.5, 0.5, 0.5) + + +QWEN_VL_IMAGE_LAYOUT = QwenVLImageLayoutSpec() +KIMI_K25_IMAGE_LAYOUT = KimiK25ImageLayoutSpec() diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index c29473f..10c4978 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -25,7 +25,7 @@ import math import re from dataclasses import dataclass -from typing import Any, cast +from typing import Any from transformers.tokenization_utils import PreTrainedTokenizer @@ -44,15 +44,14 @@ trim_to_turn_close, ) from renderers.configs import KimiK25RendererConfig +from renderers.image_layout_specs import KIMI_K25_IMAGE_LAYOUT, KimiK25ImageLayoutSpec from renderers.parsing import _reasoning_end_token_index, parse_kimi_k2_section from renderers.qwen3_vl import ( _image_content_hash, _image_dimensions, - _image_hash, _image_source, _is_image_part, _is_video_part, - _load_pil_image, _raw_uri_and_id, ) from renderers.mm_store import image_layout_fingerprint, raw_mm_item @@ -411,17 +410,6 @@ def _encode_tools_typescript(tools: list[ToolSpec]) -> str: return "# Tools\n\n## functions\nnamespace functions {\n" + functions_str + "\n}\n" -@dataclass(frozen=True) -class KimiImageLayoutConfig: - patch_size: int - merge_kernel_size: int - in_patch_limit: int - patch_limit_on_one_side: int - fixed_output_tokens: int | None - image_mean: tuple[float, ...] - image_std: tuple[float, ...] - - @dataclass(frozen=True) class KimiImageLayoutDescriptor: mm_hash: str @@ -432,52 +420,12 @@ class KimiImageLayoutDescriptor: raw_image_id: str | None = None -def kimi_image_layout_config_for_renderer(renderer: Any) -> KimiImageLayoutConfig: - config = renderer.config - values = { - "patch_size": getattr(config, "image_patch_size", None), - "merge_kernel_size": getattr(config, "image_merge_kernel_size", None), - "in_patch_limit": getattr(config, "image_in_patch_limit", None), - "patch_limit_on_one_side": getattr( - config, "image_patch_limit_on_one_side", None - ), - "fixed_output_tokens": getattr(config, "image_fixed_output_tokens", None), - "image_mean": getattr(config, "image_mean", None), - "image_std": getattr(config, "image_std", None), - } - missing = [ - name - for name, value in values.items() - if value is None and name != "fixed_output_tokens" - ] - if missing: - raise RuntimeError( - "Kimi image layout must be declared on the renderer config; missing " - + ", ".join(missing) - ) - image_mean = cast("tuple[float, ...] | list[float]", values["image_mean"]) - image_std = cast("tuple[float, ...] | list[float]", values["image_std"]) - return KimiImageLayoutConfig( - patch_size=int(values["patch_size"]), - merge_kernel_size=int(values["merge_kernel_size"]), - in_patch_limit=int(values["in_patch_limit"]), - patch_limit_on_one_side=int(values["patch_limit_on_one_side"]), - fixed_output_tokens=( - None - if values["fixed_output_tokens"] is None - else int(values["fixed_output_tokens"]) - ), - image_mean=tuple(float(v) for v in image_mean), - image_std=tuple(float(v) for v in image_std), - ) - - def _ceil_to_factor(value: int, factor: int) -> int: return max(factor, math.ceil(value / factor) * factor) def _kimi_resize_config( - width: int, height: int, layout: KimiImageLayoutConfig + width: int, height: int, layout: KimiK25ImageLayoutSpec ) -> tuple[int, int, int]: """Kimi MoonViT/NavIT image resize layout without materializing pixels.""" if height <= 0 or width <= 0: @@ -504,12 +452,10 @@ def _kimi_resize_config( return padded_w, padded_h, int(num_tokens) -def describe_kimi_image_layout( - renderer: Any, part: dict[str, Any] -) -> KimiImageLayoutDescriptor: +def describe_kimi_image_layout(part: dict[str, Any]) -> KimiImageLayoutDescriptor: source = _image_source(part) height, width = _image_dimensions(source) - layout = kimi_image_layout_config_for_renderer(renderer) + layout = KIMI_K25_IMAGE_LAYOUT padded_w, padded_h, num_media_tokens = _kimi_resize_config(width, height, layout) grid_thws = [[1, padded_h // layout.patch_size, padded_w // layout.patch_size]] fingerprint = image_layout_fingerprint( @@ -533,10 +479,8 @@ def describe_kimi_image_layout( ) -def kimi_image_item_for_render( - renderer: Any, part: dict[str, Any] -) -> tuple[int, str, dict[str, Any]]: - desc = describe_kimi_image_layout(renderer, part) +def kimi_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: + desc = describe_kimi_image_layout(part) item = raw_mm_item( modality="image", family=KIMI_K25_FAMILY, @@ -552,23 +496,6 @@ def kimi_image_item_for_render( return 1, desc.mm_hash, item -def _kimi_grid_from_item(item: dict[str, Any]) -> Any: - payload = item.get("payload") - if isinstance(payload, dict) and payload.get("grid_thws") is not None: - return payload["grid_thws"] - return item.get("grid_thws") - - -def _kimi_grids_equal(a: Any, b: Any) -> bool: - if a is None or b is None: - return False - al = a.tolist() if hasattr(a, "tolist") else a - bl = b.tolist() if hasattr(b, "tolist") else b - return al == bl - - - - # --------------------------------------------------------------------------- # Kimi K2.5 response parsing (mirrors K2 format, same token structure) # --------------------------------------------------------------------------- @@ -929,7 +856,7 @@ def emit_image( ``<|media_content|>``, ``<|media_end|>``, the trailing ``\\n``) are template-injected scaffold. """ - _placeholder_len, h, mm_item = kimi_image_item_for_render(self, part) + _placeholder_len, h, mm_item = kimi_image_item_for_render(part) emit_special( self._media_begin, msg_idx, is_sampled=is_sampled, is_content=False ) @@ -1215,7 +1142,7 @@ def emit_image( is_sampled: bool = False, is_content: bool = False, ) -> None: - _placeholder_len, h, mm_item = kimi_image_item_for_render(self, part) + _placeholder_len, h, mm_item = kimi_image_item_for_render(part) emit_special(self._media_begin, msg_idx) emit_text("image", msg_idx) emit_special(self._media_content, msg_idx) diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 1355b96..8f08eca 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -30,13 +30,15 @@ RAW_MM_ITEM_KIND = "prime_raw_mm_item" RAW_MM_ITEM_VERSION = 1 -_SAFE_RUN_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") -_SAFE_FAMILY_RE = re.compile(r"^[A-Za-z0-9_.-]+$") -_SAFE_MODALITY_RE = re.compile(r"^[A-Za-z0-9_.-]+$") -_SAFE_FINGERPRINT_RE = re.compile(r"^[a-f0-9]{16,64}$") -_SAFE_MM_HASH_RE = re.compile(r"^[a-f0-9]{16,128}$") -_SAFE_IMAGE_ID_RE = re.compile(r"^[A-Za-z0-9_.-]+$") -_SAFE_REF_PAYLOAD_RE = re.compile(r"^[A-Za-z0-9_-]*$") +_SAFE = { + "run id": re.compile(r"^[A-Za-z0-9_.-]+$"), + "multimodal family": re.compile(r"^[A-Za-z0-9_.-]+$"), + "raw multimodal modality": re.compile(r"^[A-Za-z0-9_.-]+$"), + "image layout fingerprint": re.compile(r"^[a-f0-9]{16,64}$"), + "image hash": re.compile(r"^[a-f0-9]{16,128}$"), + "raw image id": re.compile(r"^[A-Za-z0-9_.-]+$"), + "raw multimodal ref payload segment": re.compile(r"^[A-Za-z0-9_-]*$"), +} _MEDIA_TYPE_EXT = { "jpeg": ".jpg", @@ -47,14 +49,20 @@ } +def _ensure_safe(label: str, value: str) -> str: + if not _SAFE[label].fullmatch(value): + raise ValueError(f"Invalid {label}: {value!r}") + return value + + def normalize_run_id(run_id: str) -> str: """Return the canonical run id, without the directory's ``run_`` prefix.""" value = run_id.strip() if value.startswith("run_"): value = value[len("run_") :] - if not value or not _SAFE_RUN_ID_RE.fullmatch(value): + if not value: raise ValueError(f"Invalid run id: {run_id!r}") - return value + return _ensure_safe("run id", value) def run_dir_name(run_id: str) -> str: @@ -116,8 +124,6 @@ def run_image_dir(run_id: str | None = None) -> Path: return (run_dir(run_id) / IMAGE_ASSET_SUBDIR).resolve() - - def _media_type_ext(media_type: str) -> str: subtype = media_type.split("/", 1)[-1].split(";", 1)[0].strip().lower() return _MEDIA_TYPE_EXT.get(subtype, ".img") @@ -160,8 +166,7 @@ def offload_image_to_run_assets( def raw_image_path(*, run_id: str, raw_image_id: str) -> Path: - if not _SAFE_IMAGE_ID_RE.fullmatch(raw_image_id): - raise ValueError(f"Invalid raw image id: {raw_image_id!r}") + _ensure_safe("raw image id", raw_image_id) root = run_image_dir(run_id) path = (root / raw_image_id).resolve() if not path.is_relative_to(root): @@ -173,10 +178,27 @@ def _json_fingerprint_value(value: object) -> str: return json.dumps(value, sort_keys=True, separators=(",", ":"), default=str) +def _encode_ref_payload(payload: dict[str, object] | None) -> str: + raw = json.dumps(payload or {}, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + return base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=") + + +def _decode_ref_payload(encoded: str) -> dict[str, object]: + _ensure_safe("raw multimodal ref payload segment", encoded) + padded = encoded + "=" * (-len(encoded) % 4) + payload = json.loads( + base64.urlsafe_b64decode(padded.encode("ascii")).decode("utf-8") + ) + if not isinstance(payload, dict): + raise ValueError("Raw multimodal ref payload must decode to a dict") + return payload + + def image_layout_fingerprint(*, family: str, **values: object) -> str: """Stable adapter-owned fingerprint for raw multimodal layout contracts.""" - if not _SAFE_FAMILY_RE.fullmatch(family): - raise ValueError(f"Invalid multimodal family: {family!r}") + _ensure_safe("multimodal family", family) encoded_values = ":".join( f"{key}={_json_fingerprint_value(values[key])}" for key in sorted(values) ) @@ -200,12 +222,9 @@ def raw_mm_item( ``family`` and validate the common envelope, but must not inspect adapter payload keys. """ - if not _SAFE_FAMILY_RE.fullmatch(family): - raise ValueError(f"Invalid multimodal family: {family!r}") - if not _SAFE_MODALITY_RE.fullmatch(modality): - raise ValueError(f"Invalid raw multimodal modality: {modality!r}") - if not _SAFE_FINGERPRINT_RE.fullmatch(layout_fingerprint): - raise ValueError(f"Invalid image layout fingerprint: {layout_fingerprint!r}") + _ensure_safe("multimodal family", family) + _ensure_safe("raw multimodal modality", modality) + _ensure_safe("image layout fingerprint", layout_fingerprint) out: dict[str, object] = { "kind": RAW_MM_ITEM_KIND, "version": RAW_MM_ITEM_VERSION, @@ -254,24 +273,12 @@ def raw_mm_ref( future families without baking shape names into the wire id. """ run_id = normalize_run_id(run_id) - if not _SAFE_FAMILY_RE.fullmatch(family): - raise ValueError(f"Invalid multimodal family: {family!r}") - if not _SAFE_FINGERPRINT_RE.fullmatch(fingerprint): - raise ValueError(f"Invalid image layout fingerprint: {fingerprint!r}") - if not _SAFE_MODALITY_RE.fullmatch(modality): - raise ValueError(f"Invalid raw multimodal modality: {modality!r}") - if not _SAFE_MM_HASH_RE.fullmatch(mm_hash): - raise ValueError(f"Invalid image hash: {mm_hash!r}") + _ensure_safe("multimodal family", family) + _ensure_safe("image layout fingerprint", fingerprint) + _ensure_safe("raw multimodal modality", modality) + _ensure_safe("image hash", mm_hash) raw_image_path(run_id=run_id, raw_image_id=raw_image_id) - encoded_payload = ( - base64.urlsafe_b64encode( - json.dumps(payload or {}, sort_keys=True, separators=(",", ":")).encode( - "utf-8" - ) - ) - .decode("ascii") - .rstrip("=") - ) + encoded_payload = _encode_ref_payload(payload) return ( f"{IMAGE_REF_PREFIX}:{run_id}:{family}:{fingerprint}:" f"{modality}:{mm_hash}:{raw_image_id}:{encoded_payload}" @@ -285,21 +292,12 @@ def split_raw_mm_ref(ref: str) -> RawMMRef: run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = ( parts[2:] ) - if not _SAFE_REF_PAYLOAD_RE.fullmatch(encoded_payload): - raise ValueError("Invalid raw multimodal ref payload segment") - padded = encoded_payload + "=" * (-len(encoded_payload) % 4) - payload = json.loads( - base64.urlsafe_b64decode(padded.encode("ascii")).decode("utf-8") - ) - if not isinstance(payload, dict): - raise ValueError("Raw multimodal ref payload must decode to a dict") return RawMMRef( run_id=normalize_run_id(run_id), - family=family, - fingerprint=fingerprint, - modality=modality, - mm_hash=mm_hash, - raw_image_id=raw_image_id, - payload=payload, + family=_ensure_safe("multimodal family", family), + fingerprint=_ensure_safe("image layout fingerprint", fingerprint), + modality=_ensure_safe("raw multimodal modality", modality), + mm_hash=_ensure_safe("image hash", mm_hash), + raw_image_id=_ensure_safe("raw image id", raw_image_id), + payload=_decode_ref_payload(encoded_payload), ) - diff --git a/renderers/qwen35.py b/renderers/qwen35.py index 906c6cb..adc8efd 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -7,8 +7,8 @@ processor class ``Qwen3VLProcessor``). When a user/tool message carries an ``ImagePart``, the renderer emits the same ``<|vision_start|>``+N×``<|image_pad|>`` +``<|vision_end|>`` expansion as the HF chat template (``N = -image_grid_thw.prod() // merge_size**2``) using renderer-declared image layout -metadata. It does not call the HF image processor; vLLM receives run image refs +image_grid_thw.prod() // merge_size**2``) using the renderer's baked image +layout spec. It does not call the HF image processor; vLLM receives run image refs for images it must process. """ @@ -316,7 +316,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None: # image data, so they ARE body content (is_content=True); # the surrounding ``<|vision_start|>`` / ``<|vision_end|>`` # specials are template scaffold. - n, h, mm_item = qwen_image_item_for_render(self, part) + n, h, mm_item = qwen_image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: emit_text( @@ -662,7 +662,7 @@ def emit_text_segments( content_mask.append(is_content) def emit_image(part: dict[str, Any], msg_idx: int = -1) -> None: - n, h, mm_item = qwen_image_item_for_render(self, part) + n, h, mm_item = qwen_image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: emit_text(f"Picture {vision_counts['image']}: ", msg_idx) diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 41ff581..5843428 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -28,7 +28,7 @@ import io import json import math -from dataclasses import dataclass, replace +from dataclasses import dataclass from pathlib import Path from typing import Any from urllib.parse import unquote, urlparse @@ -48,9 +48,8 @@ trim_to_turn_close, ) from renderers.configs import Qwen3VLRendererConfig +from renderers.image_layout_specs import QWEN_VL_IMAGE_LAYOUT from renderers.mm_store import ( - IMAGE_REF_PAYLOAD_KEY, - IMAGE_REF_PAYLOAD_VALUE, image_layout_fingerprint, raw_mm_item, ) @@ -169,15 +168,6 @@ def _image_hash(pil_image) -> str: return h.hexdigest()[:32] -@dataclass(frozen=True) -class QwenImageLayoutConfig: - patch_size: int - temporal_patch_size: int - merge_size: int - min_pixels: int - max_pixels: int - - @dataclass(frozen=True) class QwenImageLayoutDescriptor: mm_hash: str @@ -188,30 +178,6 @@ class QwenImageLayoutDescriptor: raw_image_id: str | None = None -def qwen_image_layout_config_for_renderer(renderer: Any) -> QwenImageLayoutConfig: - config = renderer.config - values = { - "patch_size": getattr(config, "image_patch_size", None), - "temporal_patch_size": getattr(config, "image_temporal_patch_size", None), - "merge_size": getattr(config, "image_merge_size", None), - "min_pixels": getattr(config, "image_min_pixels", None), - "max_pixels": getattr(config, "image_max_pixels", None), - } - missing = [name for name, value in values.items() if value is None] - if missing: - raise RuntimeError( - "Qwen image layout must be declared on the renderer config; missing " - + ", ".join(missing) - ) - return QwenImageLayoutConfig( - patch_size=int(values["patch_size"]), - temporal_patch_size=int(values["temporal_patch_size"]), - merge_size=int(values["merge_size"]), - min_pixels=int(values["min_pixels"]), - max_pixels=int(values["max_pixels"]), - ) - - def _smart_resize( height: int, width: int, @@ -292,13 +258,11 @@ def _raw_uri_and_id(source: Any) -> tuple[str | None, str | None]: return path.as_uri(), path.name -def describe_qwen_image_layout( - renderer: Any, part: dict[str, Any] -) -> QwenImageLayoutDescriptor: +def describe_qwen_image_layout(part: dict[str, Any]) -> QwenImageLayoutDescriptor: """Return Qwen image layout metadata without invoking an image processor.""" source = _image_source(part) height, width = _image_dimensions(source) - layout = qwen_image_layout_config_for_renderer(renderer) + layout = QWEN_VL_IMAGE_LAYOUT resized_h, resized_w = _smart_resize( height, width, @@ -331,10 +295,8 @@ def describe_qwen_image_layout( ) -def qwen_image_item_for_render( - renderer: Any, part: dict[str, Any] -) -> tuple[int, str, dict[str, Any]]: - desc = describe_qwen_image_layout(renderer, part) +def qwen_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str, Any]]: + desc = describe_qwen_image_layout(part) item = raw_mm_item( modality="image", family="qwen_vl", @@ -346,76 +308,6 @@ def qwen_image_item_for_render( return desc.num_image_tokens, desc.mm_hash, item -def _iter_image_parts(messages: list[Any]): - for msg in messages or []: - content = msg.get("content") if isinstance(msg, dict) else None - if not isinstance(content, list): - continue - for item in content: - if isinstance(item, dict) and _is_image_part(item): - yield item - - -def _grids_equal(a: Any, b: Any) -> bool: - if a is None or b is None: - return False - al = a.tolist() if hasattr(a, "tolist") else list(a) - bl = b.tolist() if hasattr(b, "tolist") else list(b) - return al == bl - - -def _qwen_grid_from_item(item: dict[str, Any]) -> Any: - payload = item.get("payload") - if isinstance(payload, dict) and payload.get("image_grid_thw") is not None: - return payload["image_grid_thw"] - return item.get("image_grid_thw") - - -def _qwen_item_with_grid_and_ref( - item: dict[str, Any], - *, - image_grid_thw: Any, - fingerprint: str, - raw_uri: str, - raw_image_id: str, -) -> dict[str, Any]: - new_item = { - k: v - for k, v in item.items() - if k - not in { - "raw_uri", - "raw_image_id", - "image_layout_fingerprint", - IMAGE_REF_PAYLOAD_KEY, - } - } - if new_item.get("family") == "qwen_vl" and isinstance( - new_item.get("payload"), dict - ): - payload = dict(new_item["payload"]) - payload["image_grid_thw"] = image_grid_thw - new_item["payload"] = payload - new_item["layout_fingerprint"] = fingerprint - else: - new_item = raw_mm_item( - modality="image", - family="qwen_vl", - layout_fingerprint=fingerprint, - payload={"image_grid_thw": image_grid_thw}, - ) - new_item.update( - { - "raw_uri": raw_uri, - "raw_image_id": raw_image_id, - IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, - } - ) - return new_item - - - - class _Emitter: """Token-stream builder with BPE-safe text buffering. @@ -659,7 +551,7 @@ def emit_image(part: dict[str, Any]) -> None: # image data, so they ARE body content (is_content=True); # the surrounding ``<|vision_start|>`` / ``<|vision_end|>`` # markers are renderer-emitted scaffold. - n, h, mm_item = qwen_image_item_for_render(self, part) + n, h, mm_item = qwen_image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: em.text( @@ -920,7 +812,7 @@ def bridge_to_next_turn( vision_counts = {"image": prev_image_count, "video": prev_video_count} def emit_image(part: dict[str, Any]) -> None: - n, h, mm_item = qwen_image_item_for_render(self, part) + n, h, mm_item = qwen_image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: em.text( diff --git a/tests/test_client.py b/tests/test_client.py index 3a2d56d..18dfa25 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,6 +1,5 @@ import asyncio import base64 -import hashlib import json from typing import Any @@ -11,7 +10,6 @@ ParsedResponse, ParsedToolCall, RenderedTokens, - ToolSpec, ToolCallParseStatus, ) from renderers.client import generate @@ -375,8 +373,8 @@ def test_generate_serializes_image_refs_for_qwen_vl_family( ): """When the renderer emits ``MultiModalData``, ``generate`` translates it into vLLM's ``features`` payload (mm_hashes + mm_placeholders + - image-ref kwargs_data) and sticks it in the request body. Descriptor-only - images stay ``None`` so vLLM can resolve them from its cache.""" + image-ref kwargs_data) and sticks it in the request body. Every image slot + carries a lightweight raw ref.""" import importlib from renderers.base import ( @@ -406,6 +404,7 @@ def parse_response(self, completion_ids, *, tools=None): image_dir = tmp_path / "run_rawtest" / "assets" / "images" image_dir.mkdir(parents=True) (image_dir / "image.png").write_bytes(b"image-bytes") + (image_dir / "image2.png").write_bytes(b"other-image-bytes") monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) monkeypatch.setenv("RUN_ID", "rawtest") fingerprint = image_layout_fingerprint( @@ -440,6 +439,8 @@ def parse_response(self, completion_ids, *, tools=None): family="qwen_vl", layout_fingerprint=fingerprint, payload={"image_grid_thw": [[1, 2, 2]]}, + raw_uri=(image_dir / "image2.png").as_uri(), + raw_image_id="image2.png", ), ], }, @@ -466,7 +467,6 @@ def parse_response(self, completion_ids, *, tools=None): "image": [{"offset": 5, "length": 1}, {"offset": 10, "length": 1}], } items = features["kwargs_data"]["image"] - assert items[1] is None ref = split_raw_mm_ref(items[0]) assert ref.payload == {"image_grid_thw": [[1, 2, 2]]} assert ( @@ -484,7 +484,23 @@ def parse_response(self, completion_ids, *, tools=None): "a" * 32, "image.png", ) - assert "raw_image_id" not in result["multi_modal_data"].mm_items["image"][0] + ref2 = split_raw_mm_ref(items[1]) + assert ( + ref2.run_id, + ref2.family, + ref2.fingerprint, + ref2.modality, + ref2.mm_hash, + ref2.raw_image_id, + ) == ( + "rawtest", + "qwen_vl", + fingerprint, + "image", + "b" * 32, + "image2.png", + ) + assert result["multi_modal_data"] is mm_data # --------------------------------------------------------------------------- diff --git a/tests/test_multimodal_image_layout_parity.py b/tests/test_multimodal_image_layout_parity.py new file mode 100644 index 0000000..d9869ec --- /dev/null +++ b/tests/test_multimodal_image_layout_parity.py @@ -0,0 +1,125 @@ +"""Image-layout descriptor parity against real HF processors.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any + +import pytest + +from renderers.image_layout_specs import KIMI_K25_IMAGE_LAYOUT, QWEN_VL_IMAGE_LAYOUT +from renderers.kimi_k25 import describe_kimi_image_layout +from renderers.qwen3_vl import describe_qwen_image_layout + +pytest.importorskip("PIL", reason="Pillow required for image layout parity tests") +pytest.importorskip("torch", reason="torch required for image layout parity tests") +pytest.importorskip( + "transformers", reason="transformers required for image layout parity tests" +) + +from PIL import Image # noqa: E402 +from transformers import AutoProcessor # noqa: E402 + + +QWEN_MODEL = "Qwen/Qwen3-VL-4B-Instruct" +KIMI_MODEL = "moonshotai/Kimi-K2.5" +KIMI_REVISION = "4d01dfe0332d63057c186e0b262165819efb6611" + +IMAGE_SIZES = [(32, 32), (64, 256), (512, 512)] + + +def _hf_snapshot_cached(model_name: str) -> bool: + cache = ( + Path(os.environ.get("HF_HOME") or Path.home() / ".cache" / "huggingface") + / "hub" + ) + snapshots = cache / ("models--" + model_name.replace("/", "--")) / "snapshots" + return snapshots.is_dir() and any(p.is_dir() for p in snapshots.iterdir()) + + +def _load_processor(model_name: str, **kwargs: Any): + if not _hf_snapshot_cached(model_name): + pytest.skip(f"{model_name}: HF snapshot not cached locally") + return AutoProcessor.from_pretrained(model_name, **kwargs) + + +def _images(): + return [ + Image.new("RGB", size, color=(64 + idx * 32, 128, 192)) + for idx, size in enumerate(IMAGE_SIZES) + ] + + +def _tensor_rows(value: Any) -> list[list[int]]: + return [[int(cell) for cell in row] for row in value.tolist()] + + +def test_qwen_image_layout_descriptor_matches_processor(): + processor = _load_processor(QWEN_MODEL) + images = _images() + messages = [ + { + "role": "user", + "content": [ + item + for idx, image in enumerate(images) + for item in ( + {"type": "image", "image": image}, + {"type": "text", "text": f"image {idx}"}, + ) + ], + } + ] + + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + processor_grids = _tensor_rows( + processor(images=images, text=text, return_tensors="pt")["image_grid_thw"] + ) + descriptors = [ + describe_qwen_image_layout({"type": "image", "image": image}) + for image in images + ] + + assert [desc.image_grid_thw[0] for desc in descriptors] == processor_grids + merge_area = QWEN_VL_IMAGE_LAYOUT.merge_size**2 + assert [desc.num_image_tokens for desc in descriptors] == [ + grid_t * grid_h * grid_w // merge_area + for grid_t, grid_h, grid_w in processor_grids + ] + + +def test_kimi_image_layout_descriptor_matches_processor(): + processor = _load_processor( + KIMI_MODEL, trust_remote_code=True, revision=KIMI_REVISION + ) + images = _images() + messages = [ + { + "role": "user", + "content": [ + item + for idx, image in enumerate(images) + for item in ( + {"type": "image_url", "image_url": image}, + {"type": "text", "text": f"image {idx}"}, + ) + ], + } + ] + + out = processor(messages=messages, add_generation_prompt=True, return_tensors="pt") + processor_grids = _tensor_rows(out["grid_thws"]) + descriptors = [ + describe_kimi_image_layout({"type": "image_url", "image_url": image}) + for image in images + ] + + assert [desc.grid_thws[0] for desc in descriptors] == processor_grids + merge_area = KIMI_K25_IMAGE_LAYOUT.merge_kernel_size**2 + assert [desc.num_media_tokens for desc in descriptors] == [ + grid_t * grid_h * grid_w // merge_area + for grid_t, grid_h, grid_w in processor_grids + ] From e97c812bd16ca3631eb4f68af1ddcf1d4e5a5248 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Sun, 28 Jun 2026 07:01:14 +0000 Subject: [PATCH 08/16] feat: support inline raw image refs --- renderers/client.py | 10 +++- renderers/mm_store.py | 126 ++++++++++++++++++++++++++++++++++-------- renderers/qwen3_vl.py | 17 +++++- tests/test_client.py | 31 +++++++++++ 4 files changed, 157 insertions(+), 27 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index 3465536..838f082 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -393,12 +393,17 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: if not isinstance(feature_modality, str) or not feature_modality: raise ValueError("raw multimodal item has invalid vllm_modality") + raw_uri = item.get("raw_uri") raw_image_id = item.get("raw_image_id") family = item.get("family") fingerprint = item.get("layout_fingerprint") payload = item.get("payload") - if not isinstance(raw_image_id, str) or not raw_image_id: - raise ValueError("raw multimodal item is missing raw_image_id") + if raw_uri is not None and not isinstance(raw_uri, str): + raise ValueError("raw multimodal item raw_uri must be a string") + if raw_image_id is not None and not isinstance(raw_image_id, str): + raise ValueError("raw multimodal item raw_image_id must be a string") + if not raw_uri and not raw_image_id: + raise ValueError("raw multimodal item is missing raw image source") if not isinstance(family, str) or not family: raise ValueError("raw multimodal item is missing family") if not isinstance(fingerprint, str) or not fingerprint: @@ -418,6 +423,7 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: modality=feature_modality, mm_hash=mm_hashes[idx], raw_image_id=raw_image_id, + raw_uri=raw_uri, payload=payload, ) ) diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 8f08eca..05cdeb6 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -20,11 +20,18 @@ RUN_OUTPUT_ROOT = Path("/data/outputs") IMAGE_OFFLOAD_DIR_ENV = "VF_RENDERER_IMAGE_OFFLOAD_DIR" +IMAGE_STORAGE_ENV = "PRIME_RL_MM_IMAGE_STORAGE" RUN_DIR_ENV = "PRIME_RL_RUN_DIR" RUN_ID_ENV = "RUN_ID" +IMAGE_STORAGE_OFFLOAD = "offload" +IMAGE_STORAGE_INLINE = "inline" +IMAGE_STORAGE_MODES = {IMAGE_STORAGE_OFFLOAD, IMAGE_STORAGE_INLINE} + IMAGE_ASSET_SUBDIR = Path("assets/images") -IMAGE_REF_PREFIX = "mmraw:v2" +IMAGE_REF_PREFIX = "mmraw" +IMAGE_REF_V2_PREFIX = "mmraw:v2" +IMAGE_REF_VERSION = "v3" IMAGE_REF_PAYLOAD_KEY = "_prime_rl_image_ref" IMAGE_REF_PAYLOAD_VALUE = "raw_image" RAW_MM_ITEM_KIND = "prime_raw_mm_item" @@ -55,6 +62,15 @@ def _ensure_safe(label: str, value: str) -> str: return value +def image_storage_mode() -> str: + mode = os.getenv(IMAGE_STORAGE_ENV, IMAGE_STORAGE_OFFLOAD).strip().lower() + if mode not in IMAGE_STORAGE_MODES: + raise ValueError( + f"{IMAGE_STORAGE_ENV} must be one of {sorted(IMAGE_STORAGE_MODES)}, got {mode!r}" + ) + return mode + + def normalize_run_id(run_id: str) -> str: """Return the canonical run id, without the directory's ``run_`` prefix.""" value = run_id.strip() @@ -92,6 +108,9 @@ def current_run_id() -> str: pass return "explicit" + if image_storage_mode() == IMAGE_STORAGE_INLINE: + return "inline" + raise RuntimeError( f"Set {IMAGE_OFFLOAD_DIR_ENV}, {RUN_DIR_ENV}, or {RUN_ID_ENV} before emitting image refs." ) @@ -235,14 +254,12 @@ def raw_mm_item( } if vllm_modality is not None: out["vllm_modality"] = vllm_modality - if raw_uri is not None and raw_image_id is not None: - out.update( - { - "raw_uri": raw_uri, - "raw_image_id": raw_image_id, - IMAGE_REF_PAYLOAD_KEY: IMAGE_REF_PAYLOAD_VALUE, - } - ) + if raw_uri is not None: + out["raw_uri"] = raw_uri + out[IMAGE_REF_PAYLOAD_KEY] = IMAGE_REF_PAYLOAD_VALUE + if raw_image_id is not None: + out["raw_image_id"] = raw_image_id + out[IMAGE_REF_PAYLOAD_KEY] = IMAGE_REF_PAYLOAD_VALUE return out @@ -253,8 +270,9 @@ class RawMMRef: fingerprint: str modality: str mm_hash: str - raw_image_id: str payload: dict[str, object] + raw_uri: str | None = None + raw_image_id: str | None = None def raw_mm_ref( @@ -264,7 +282,8 @@ def raw_mm_ref( fingerprint: str, modality: str, mm_hash: str, - raw_image_id: str, + raw_image_id: str | None = None, + raw_uri: str | None = None, payload: dict[str, object] | None = None, ) -> str: """Generic raw multimodal asset ref. @@ -277,27 +296,88 @@ def raw_mm_ref( _ensure_safe("image layout fingerprint", fingerprint) _ensure_safe("raw multimodal modality", modality) _ensure_safe("image hash", mm_hash) - raw_image_path(run_id=run_id, raw_image_id=raw_image_id) - encoded_payload = _encode_ref_payload(payload) - return ( - f"{IMAGE_REF_PREFIX}:{run_id}:{family}:{fingerprint}:" - f"{modality}:{mm_hash}:{raw_image_id}:{encoded_payload}" - ) + if raw_image_id is None and raw_uri is None: + raise ValueError("raw multimodal refs require raw_image_id or raw_uri") + if raw_image_id is not None: + raw_image_path(run_id=run_id, raw_image_id=raw_image_id) + if raw_uri is not None and not raw_uri: + raise ValueError("raw_uri must be non-empty when set") + + ref_payload: dict[str, object] = { + "run_id": run_id, + "family": family, + "fingerprint": fingerprint, + "modality": modality, + "mm_hash": mm_hash, + "payload": payload or {}, + } + if raw_image_id is not None: + ref_payload["raw_image_id"] = raw_image_id + if raw_uri is not None: + ref_payload["raw_uri"] = raw_uri + + return f"{IMAGE_REF_PREFIX}:{IMAGE_REF_VERSION}:{_encode_ref_payload(ref_payload)}" def split_raw_mm_ref(ref: str) -> RawMMRef: parts = ref.split(":") - if parts[:2] != ["mmraw", "v2"] or len(parts) != 9: + if parts[:2] == ["mmraw", "v2"] and len(parts) == 9: + run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = ( + parts[2:] + ) + return RawMMRef( + run_id=normalize_run_id(run_id), + family=_ensure_safe("multimodal family", family), + fingerprint=_ensure_safe("image layout fingerprint", fingerprint), + modality=_ensure_safe("raw multimodal modality", modality), + mm_hash=_ensure_safe("image hash", mm_hash), + payload=_decode_ref_payload(encoded_payload), + raw_image_id=_ensure_safe("raw image id", raw_image_id), + ) + + if parts[:2] != ["mmraw", IMAGE_REF_VERSION] or len(parts) != 3: raise ValueError(f"Invalid raw multimodal ref shape: {ref!r}") - run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = ( - parts[2:] - ) + + payload = _decode_ref_payload(parts[2]) + run_id = payload.get("run_id") + family = payload.get("family") + fingerprint = payload.get("fingerprint") + modality = payload.get("modality") + mm_hash = payload.get("mm_hash") + raw_uri = payload.get("raw_uri") + raw_image_id = payload.get("raw_image_id") + item_payload = payload.get("payload") + + if not isinstance(run_id, str): + raise ValueError("Raw multimodal ref is missing run_id") + if not isinstance(family, str): + raise ValueError("Raw multimodal ref is missing family") + if not isinstance(fingerprint, str): + raise ValueError("Raw multimodal ref is missing fingerprint") + if not isinstance(modality, str): + raise ValueError("Raw multimodal ref is missing modality") + if not isinstance(mm_hash, str): + raise ValueError("Raw multimodal ref is missing mm_hash") + if raw_uri is not None and not isinstance(raw_uri, str): + raise ValueError("Raw multimodal ref raw_uri must be a string") + if raw_image_id is not None and not isinstance(raw_image_id, str): + raise ValueError("Raw multimodal ref raw_image_id must be a string") + if raw_uri is None and raw_image_id is None: + raise ValueError("Raw multimodal ref is missing an image source") + if not isinstance(item_payload, dict): + raise ValueError("Raw multimodal ref payload must be a dict") + return RawMMRef( run_id=normalize_run_id(run_id), family=_ensure_safe("multimodal family", family), fingerprint=_ensure_safe("image layout fingerprint", fingerprint), modality=_ensure_safe("raw multimodal modality", modality), mm_hash=_ensure_safe("image hash", mm_hash), - raw_image_id=_ensure_safe("raw image id", raw_image_id), - payload=_decode_ref_payload(encoded_payload), + payload=item_payload, + raw_uri=raw_uri, + raw_image_id=_ensure_safe("raw image id", raw_image_id) if raw_image_id is not None else None, ) + + +def is_raw_mm_ref(ref: object) -> bool: + return isinstance(ref, str) and ref.startswith(f"{IMAGE_REF_PREFIX}:") diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 7e41d0d..faa1ac3 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -141,8 +141,7 @@ def _load_pil_image(item: dict[str, Any]): if raw.startswith("data:"): # data:image/png;base64,XXXX - _, _, payload = raw.partition(",") - return Image.open(io.BytesIO(base64.b64decode(payload))).convert("RGB") + return Image.open(io.BytesIO(_data_image_bytes(raw))).convert("RGB") parsed = urlparse(raw) if parsed.scheme in ("http", "https"): @@ -170,6 +169,16 @@ def _image_hash(pil_image) -> str: return h.hexdigest()[:32] +def _data_image_bytes(source: str) -> bytes: + if not source.startswith("data:image/"): + raise ValueError(f"Expected data:image URI, got {source!r}") + marker = ";base64," + if marker not in source: + raise ValueError("data:image URI must use base64 encoding") + _, b64 = source.split(marker, 1) + return base64.b64decode(b64) + + @dataclass(frozen=True) class QwenImageLayoutDescriptor: mm_hash: str @@ -250,10 +259,14 @@ def _image_content_hash(source: Any) -> str: path = _file_path_from_source(source) if path is not None: return hashlib.sha256(path.read_bytes()).hexdigest()[:32] + if isinstance(source, str) and source.startswith("data:image/"): + return hashlib.sha256(_data_image_bytes(source)).hexdigest()[:32] return _image_hash(_load_pil_image({"image": source})) def _raw_uri_and_id(source: Any) -> tuple[str | None, str | None]: + if isinstance(source, str) and source.startswith("data:image/"): + return source, None path = _file_path_from_source(source) if path is None: return None, None diff --git a/tests/test_client.py b/tests/test_client.py index 18dfa25..505ac32 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,5 +1,6 @@ import asyncio import base64 +import hashlib import json from typing import Any @@ -181,6 +182,36 @@ def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): assert rendered.multi_modal_data.mm_placeholders["image"][0].length == 64 +def test_qwen3_vl_render_preserves_inline_data_uri_raw_source(tmp_path): + pytest.importorskip("PIL") + from PIL import Image + from renderers.qwen3_vl import Qwen3VLRenderer + + image_path = tmp_path / "image.png" + Image.new("RGB", (32, 32), color=(0, 255, 0)).save(image_path) + raw = image_path.read_bytes() + data_uri = f"data:image/png;base64,{base64.b64encode(raw).decode('ascii')}" + renderer = Qwen3VLRenderer(_TinyQwenTokenizer()) + + rendered = renderer.render( + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_uri}} + ], + } + ], + add_generation_prompt=True, + ) + + assert rendered.multi_modal_data is not None + item = rendered.multi_modal_data.mm_items["image"][0] + assert item["raw_uri"] == data_uri + assert "raw_image_id" not in item + assert rendered.multi_modal_data.mm_hashes["image"][0] == hashlib.sha256(raw).hexdigest()[:32] + + def test_generate_builds_request_body_and_parses_response(): From 673b790916ec70efe89526a66b3fa2eed0b88bd6 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Mon, 29 Jun 2026 06:08:48 +0000 Subject: [PATCH 09/16] Clean up raw multimodal offload renderers --- renderers/client.py | 15 +- renderers/image_layout_specs.py | 33 --- renderers/kimi_k25.py | 29 ++- renderers/mm_store.py | 173 ++------------ renderers/qwen3_vl.py | 132 +++-------- tests/test_client.py | 225 ++++--------------- tests/test_multimodal.py | 74 ++++-- tests/test_multimodal_image_layout_parity.py | 125 ----------- 8 files changed, 163 insertions(+), 643 deletions(-) delete mode 100644 renderers/image_layout_specs.py delete mode 100644 tests/test_multimodal_image_layout_parity.py diff --git a/renderers/client.py b/renderers/client.py index 838f082..6132ad1 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -347,8 +347,6 @@ def _features_and_descriptor_mm() -> tuple[ } - - def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: """Serialize ``MultiModalData`` to vLLM's ``/inference/v1/generate`` features payload. @@ -360,7 +358,6 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: """ from renderers.mm_store import ( RAW_MM_ITEM_KIND, - current_run_id, raw_mm_ref, ) @@ -370,7 +367,6 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: "kwargs_data": {}, } - run_id = current_run_id() for source_modality, items in mm_data.mm_items.items(): if not items: continue @@ -393,17 +389,12 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: if not isinstance(feature_modality, str) or not feature_modality: raise ValueError("raw multimodal item has invalid vllm_modality") - raw_uri = item.get("raw_uri") raw_image_id = item.get("raw_image_id") family = item.get("family") fingerprint = item.get("layout_fingerprint") payload = item.get("payload") - if raw_uri is not None and not isinstance(raw_uri, str): - raise ValueError("raw multimodal item raw_uri must be a string") - if raw_image_id is not None and not isinstance(raw_image_id, str): - raise ValueError("raw multimodal item raw_image_id must be a string") - if not raw_uri and not raw_image_id: - raise ValueError("raw multimodal item is missing raw image source") + if not isinstance(raw_image_id, str) or not raw_image_id: + raise ValueError("raw multimodal item is missing raw_image_id") if not isinstance(family, str) or not family: raise ValueError("raw multimodal item is missing family") if not isinstance(fingerprint, str) or not fingerprint: @@ -417,13 +408,11 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: ) out["kwargs_data"].setdefault(feature_modality, []).append( raw_mm_ref( - run_id=run_id, family=family, fingerprint=fingerprint, modality=feature_modality, mm_hash=mm_hashes[idx], raw_image_id=raw_image_id, - raw_uri=raw_uri, payload=payload, ) ) diff --git a/renderers/image_layout_specs.py b/renderers/image_layout_specs.py deleted file mode 100644 index c233eea..0000000 --- a/renderers/image_layout_specs.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Static multimodal image layout contracts mirrored from model processors.""" - -from __future__ import annotations - -from dataclasses import dataclass - - -@dataclass(frozen=True) -class QwenVLImageLayoutSpec: - """Qwen-VL image processor values needed for raw descriptor layout math.""" - - patch_size: int = 16 - temporal_patch_size: int = 2 - merge_size: int = 2 - min_pixels: int = 65536 - max_pixels: int = 16777216 - - -@dataclass(frozen=True) -class KimiK25ImageLayoutSpec: - """Kimi K2.5 image processor values needed for raw descriptor layout math.""" - - patch_size: int = 14 - merge_kernel_size: int = 2 - in_patch_limit: int = 16384 - patch_limit_on_one_side: int = 512 - fixed_output_tokens: int | None = None - image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5) - image_std: tuple[float, float, float] = (0.5, 0.5, 0.5) - - -QWEN_VL_IMAGE_LAYOUT = QwenVLImageLayoutSpec() -KIMI_K25_IMAGE_LAYOUT = KimiK25ImageLayoutSpec() diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 0ab391d..741c8ba 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -45,7 +45,6 @@ trim_to_turn_close, ) from renderers.configs import KimiK25RendererConfig -from renderers.image_layout_specs import KIMI_K25_IMAGE_LAYOUT, KimiK25ImageLayoutSpec from renderers.parsing import _reasoning_end_token_index, parse_kimi_k2_section from renderers.qwen3_vl import ( _image_content_hash, @@ -53,7 +52,7 @@ _image_source, _is_image_part, _is_video_part, - _raw_uri_and_id, + _raw_image_id, ) from renderers.mm_store import image_layout_fingerprint, raw_mm_item @@ -411,14 +410,27 @@ def _encode_tools_typescript(tools: list[ToolSpec]) -> str: return "# Tools\n\n## functions\nnamespace functions {\n" + functions_str + "\n}\n" +@dataclass(frozen=True) +class KimiK25ImageLayoutSpec: + patch_size: int = 14 + merge_kernel_size: int = 2 + in_patch_limit: int = 16384 + patch_limit_on_one_side: int = 512 + fixed_output_tokens: int | None = None + image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5) + image_std: tuple[float, float, float] = (0.5, 0.5, 0.5) + + +KIMI_K25_IMAGE_LAYOUT = KimiK25ImageLayoutSpec() + + @dataclass(frozen=True) class KimiImageLayoutDescriptor: mm_hash: str grid_thws: list[list[int]] num_media_tokens: int fingerprint: str - raw_uri: str | None = None - raw_image_id: str | None = None + raw_image_id: str def _ceil_to_factor(value: int, factor: int) -> int: @@ -469,14 +481,12 @@ def describe_kimi_image_layout(part: dict[str, Any]) -> KimiImageLayoutDescripto image_mean=list(layout.image_mean), image_std=list(layout.image_std), ) - raw_uri, raw_image_id = _raw_uri_and_id(source) return KimiImageLayoutDescriptor( mm_hash=_image_content_hash(source), grid_thws=grid_thws, num_media_tokens=num_media_tokens, fingerprint=fingerprint, - raw_uri=raw_uri, - raw_image_id=raw_image_id, + raw_image_id=_raw_image_id(source), ) @@ -490,7 +500,6 @@ def kimi_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str "grid_thws": desc.grid_thws, "num_media_tokens": desc.num_media_tokens, }, - raw_uri=desc.raw_uri, raw_image_id=desc.raw_image_id, vllm_modality=KIMI_K25_VLLM_MODALITY, ) @@ -730,7 +739,6 @@ def __init__( # The stop token for generation self._endoftext: int | None = self._try_token_id("<|endoftext|>") - @property def mm_token_type_id_map(self) -> dict[int, int]: """Token-id → modality marker. For Kimi K2.5 only ``<|media_pad|>`` @@ -738,9 +746,6 @@ def mm_token_type_id_map(self) -> dict[int, int]: internally from ``pixel_values``.""" return {self._media_pad: 1} - - - # ------------------------------------------------------------------ # Token helpers # ------------------------------------------------------------------ diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 05cdeb6..53c69dc 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -17,28 +17,14 @@ from dataclasses import dataclass from pathlib import Path -RUN_OUTPUT_ROOT = Path("/data/outputs") - IMAGE_OFFLOAD_DIR_ENV = "VF_RENDERER_IMAGE_OFFLOAD_DIR" -IMAGE_STORAGE_ENV = "PRIME_RL_MM_IMAGE_STORAGE" -RUN_DIR_ENV = "PRIME_RL_RUN_DIR" -RUN_ID_ENV = "RUN_ID" - -IMAGE_STORAGE_OFFLOAD = "offload" -IMAGE_STORAGE_INLINE = "inline" -IMAGE_STORAGE_MODES = {IMAGE_STORAGE_OFFLOAD, IMAGE_STORAGE_INLINE} -IMAGE_ASSET_SUBDIR = Path("assets/images") IMAGE_REF_PREFIX = "mmraw" -IMAGE_REF_V2_PREFIX = "mmraw:v2" IMAGE_REF_VERSION = "v3" -IMAGE_REF_PAYLOAD_KEY = "_prime_rl_image_ref" -IMAGE_REF_PAYLOAD_VALUE = "raw_image" RAW_MM_ITEM_KIND = "prime_raw_mm_item" RAW_MM_ITEM_VERSION = 1 _SAFE = { - "run id": re.compile(r"^[A-Za-z0-9_.-]+$"), "multimodal family": re.compile(r"^[A-Za-z0-9_.-]+$"), "raw multimodal modality": re.compile(r"^[A-Za-z0-9_.-]+$"), "image layout fingerprint": re.compile(r"^[a-f0-9]{16,64}$"), @@ -62,85 +48,14 @@ def _ensure_safe(label: str, value: str) -> str: return value -def image_storage_mode() -> str: - mode = os.getenv(IMAGE_STORAGE_ENV, IMAGE_STORAGE_OFFLOAD).strip().lower() - if mode not in IMAGE_STORAGE_MODES: - raise ValueError( - f"{IMAGE_STORAGE_ENV} must be one of {sorted(IMAGE_STORAGE_MODES)}, got {mode!r}" - ) - return mode - - -def normalize_run_id(run_id: str) -> str: - """Return the canonical run id, without the directory's ``run_`` prefix.""" - value = run_id.strip() - if value.startswith("run_"): - value = value[len("run_") :] - if not value: - raise ValueError(f"Invalid run id: {run_id!r}") - return _ensure_safe("run id", value) - - -def run_dir_name(run_id: str) -> str: - return f"run_{normalize_run_id(run_id)}" - - -def current_run_id() -> str: - """Best-effort run id for refs emitted by this process.""" - raw = os.getenv(RUN_ID_ENV, "").strip() - if raw: - return normalize_run_id(raw) - - run_dir = os.getenv(RUN_DIR_ENV, "").strip() - if run_dir: - return normalize_run_id(Path(run_dir).name) - - image_dir = os.getenv(IMAGE_OFFLOAD_DIR_ENV, "").strip() - if image_dir: - # Expected shape is /assets/images. If callers pass another - # explicit directory, the ref's run segment is only a stable label; the - # path resolver will use the explicit directory in every process. - path = Path(image_dir).resolve() - if path.name == "images" and path.parent.name == "assets": - try: - return normalize_run_id(path.parent.parent.name) - except ValueError: - pass - return "explicit" - - if image_storage_mode() == IMAGE_STORAGE_INLINE: - return "inline" - - raise RuntimeError( - f"Set {IMAGE_OFFLOAD_DIR_ENV}, {RUN_DIR_ENV}, or {RUN_ID_ENV} before emitting image refs." - ) - - -def run_dir(run_id: str | None = None) -> Path: - """Resolve the run output directory. - - Resolution order: - 1. ``PRIME_RL_RUN_DIR`` as an exact run directory. - 2. ``RUN_ID`` or explicit ``run_id`` under ``/data/outputs/run_``. - """ - explicit = os.getenv(RUN_DIR_ENV, "").strip() - if explicit: - return Path(explicit).resolve() - - value = run_id or os.getenv(RUN_ID_ENV, "").strip() - if not value: - raise RuntimeError( - f"Set {RUN_DIR_ENV} or {RUN_ID_ENV} before resolving a run directory." - ) - return (RUN_OUTPUT_ROOT / run_dir_name(value)).resolve() - - -def run_image_dir(run_id: str | None = None) -> Path: +def run_image_dir() -> Path: """Resolve the directory for raw image assets for a run.""" explicit = os.getenv(IMAGE_OFFLOAD_DIR_ENV, "").strip() if explicit: return Path(explicit).resolve() - return (run_dir(run_id) / IMAGE_ASSET_SUBDIR).resolve() + raise RuntimeError( + f"Set {IMAGE_OFFLOAD_DIR_ENV} before resolving raw image assets." + ) def _media_type_ext(media_type: str) -> str: @@ -150,11 +65,11 @@ def _media_type_ext(media_type: str) -> str: def offload_image_to_run_assets( url: object, image_dir: Path | None = None -) -> tuple[str, int] | None: +) -> str | None: """Decode a base64 data image into the run image assets directory. - Returns ``(file_url, byte_count)`` when ``url`` was rewritten and ``None`` - for non-data-image values. Writes are content-addressed and atomic. + Returns a ``file://`` URL when ``url`` was rewritten and ``None`` for + non-data-image values. Writes are content-addressed and atomic. """ if not isinstance(url, str) or not url.startswith("data:image/"): return None @@ -181,12 +96,12 @@ def offload_image_to_run_assets( path.touch() except OSError: pass - return path.as_uri(), len(raw) + return path.as_uri() -def raw_image_path(*, run_id: str, raw_image_id: str) -> Path: +def raw_image_path(*, raw_image_id: str) -> Path: _ensure_safe("raw image id", raw_image_id) - root = run_image_dir(run_id) + root = run_image_dir() path = (root / raw_image_id).resolve() if not path.is_relative_to(root): raise ValueError(f"Raw image path escaped root: {path}") @@ -231,8 +146,7 @@ def raw_mm_item( family: str, layout_fingerprint: str, payload: dict[str, object], - raw_uri: str | None = None, - raw_image_id: str | None = None, + raw_image_id: str, vllm_modality: str | None = None, ) -> dict[str, object]: """Build the JSON-safe raw multimodal descriptor envelope. @@ -254,36 +168,27 @@ def raw_mm_item( } if vllm_modality is not None: out["vllm_modality"] = vllm_modality - if raw_uri is not None: - out["raw_uri"] = raw_uri - out[IMAGE_REF_PAYLOAD_KEY] = IMAGE_REF_PAYLOAD_VALUE - if raw_image_id is not None: - out["raw_image_id"] = raw_image_id - out[IMAGE_REF_PAYLOAD_KEY] = IMAGE_REF_PAYLOAD_VALUE + out["raw_image_id"] = _ensure_safe("raw image id", raw_image_id) return out @dataclass(frozen=True) class RawMMRef: - run_id: str family: str fingerprint: str modality: str mm_hash: str payload: dict[str, object] - raw_uri: str | None = None - raw_image_id: str | None = None + raw_image_id: str def raw_mm_ref( *, - run_id: str, family: str, fingerprint: str, modality: str, mm_hash: str, - raw_image_id: str | None = None, - raw_uri: str | None = None, + raw_image_id: str, payload: dict[str, object] | None = None, ) -> str: """Generic raw multimodal asset ref. @@ -291,65 +196,37 @@ def raw_mm_ref( Adapter-owned details stay in the descriptor payload so refs can serve future families without baking shape names into the wire id. """ - run_id = normalize_run_id(run_id) _ensure_safe("multimodal family", family) _ensure_safe("image layout fingerprint", fingerprint) _ensure_safe("raw multimodal modality", modality) _ensure_safe("image hash", mm_hash) - if raw_image_id is None and raw_uri is None: - raise ValueError("raw multimodal refs require raw_image_id or raw_uri") - if raw_image_id is not None: - raw_image_path(run_id=run_id, raw_image_id=raw_image_id) - if raw_uri is not None and not raw_uri: - raise ValueError("raw_uri must be non-empty when set") + raw_image_path(raw_image_id=raw_image_id) ref_payload: dict[str, object] = { - "run_id": run_id, "family": family, "fingerprint": fingerprint, "modality": modality, "mm_hash": mm_hash, "payload": payload or {}, + "raw_image_id": raw_image_id, } - if raw_image_id is not None: - ref_payload["raw_image_id"] = raw_image_id - if raw_uri is not None: - ref_payload["raw_uri"] = raw_uri return f"{IMAGE_REF_PREFIX}:{IMAGE_REF_VERSION}:{_encode_ref_payload(ref_payload)}" def split_raw_mm_ref(ref: str) -> RawMMRef: parts = ref.split(":") - if parts[:2] == ["mmraw", "v2"] and len(parts) == 9: - run_id, family, fingerprint, modality, mm_hash, raw_image_id, encoded_payload = ( - parts[2:] - ) - return RawMMRef( - run_id=normalize_run_id(run_id), - family=_ensure_safe("multimodal family", family), - fingerprint=_ensure_safe("image layout fingerprint", fingerprint), - modality=_ensure_safe("raw multimodal modality", modality), - mm_hash=_ensure_safe("image hash", mm_hash), - payload=_decode_ref_payload(encoded_payload), - raw_image_id=_ensure_safe("raw image id", raw_image_id), - ) - - if parts[:2] != ["mmraw", IMAGE_REF_VERSION] or len(parts) != 3: + if parts[:2] != [IMAGE_REF_PREFIX, IMAGE_REF_VERSION] or len(parts) != 3: raise ValueError(f"Invalid raw multimodal ref shape: {ref!r}") payload = _decode_ref_payload(parts[2]) - run_id = payload.get("run_id") family = payload.get("family") fingerprint = payload.get("fingerprint") modality = payload.get("modality") mm_hash = payload.get("mm_hash") - raw_uri = payload.get("raw_uri") raw_image_id = payload.get("raw_image_id") item_payload = payload.get("payload") - if not isinstance(run_id, str): - raise ValueError("Raw multimodal ref is missing run_id") if not isinstance(family, str): raise ValueError("Raw multimodal ref is missing family") if not isinstance(fingerprint, str): @@ -358,26 +235,22 @@ def split_raw_mm_ref(ref: str) -> RawMMRef: raise ValueError("Raw multimodal ref is missing modality") if not isinstance(mm_hash, str): raise ValueError("Raw multimodal ref is missing mm_hash") - if raw_uri is not None and not isinstance(raw_uri, str): - raise ValueError("Raw multimodal ref raw_uri must be a string") - if raw_image_id is not None and not isinstance(raw_image_id, str): - raise ValueError("Raw multimodal ref raw_image_id must be a string") - if raw_uri is None and raw_image_id is None: - raise ValueError("Raw multimodal ref is missing an image source") + if not isinstance(raw_image_id, str): + raise ValueError("Raw multimodal ref is missing raw_image_id") if not isinstance(item_payload, dict): raise ValueError("Raw multimodal ref payload must be a dict") return RawMMRef( - run_id=normalize_run_id(run_id), family=_ensure_safe("multimodal family", family), fingerprint=_ensure_safe("image layout fingerprint", fingerprint), modality=_ensure_safe("raw multimodal modality", modality), mm_hash=_ensure_safe("image hash", mm_hash), payload=item_payload, - raw_uri=raw_uri, - raw_image_id=_ensure_safe("raw image id", raw_image_id) if raw_image_id is not None else None, + raw_image_id=_ensure_safe("raw image id", raw_image_id), ) def is_raw_mm_ref(ref: object) -> bool: - return isinstance(ref, str) and ref.startswith(f"{IMAGE_REF_PREFIX}:") + return isinstance(ref, str) and ref.startswith( + f"{IMAGE_REF_PREFIX}:{IMAGE_REF_VERSION}:" + ) diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index faa1ac3..961d145 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -1,4 +1,4 @@ -"""Qwen3-VL renderer with multimodal (image + video) support. +"""Qwen3-VL renderer with multimodal image support. Produces a token stream that matches ``Qwen3VLProcessor.apply_chat_template`` byte-for-byte for text-only inputs and emits the same @@ -19,13 +19,14 @@ tokens (``<|im_start|>``, ``<|im_end|>``, ````, ``<|vision_start|>``…), which act as atomic boundaries the template also can't merge across. + +Video-shaped content parts are detected and rejected explicitly; video +materialization is not implemented in this raw-image path yet. """ from __future__ import annotations -import base64 import hashlib -import io import json import math from dataclasses import dataclass @@ -50,7 +51,6 @@ trim_to_turn_close, ) from renderers.configs import Qwen3VLRendererConfig -from renderers.image_layout_specs import QWEN_VL_IMAGE_LAYOUT from renderers.mm_store import ( image_layout_fingerprint, raw_mm_item, @@ -101,82 +101,16 @@ def _is_video_part(item: Any) -> bool: return bool(item.get("video")) or bool(item.get("video_url")) -def _load_pil_image(item: dict[str, Any]): - """Resolve an ImagePart to a PIL Image. - - Accepts pre-loaded PIL Images, raw bytes, filesystem paths, - ``file://``/``http(s)://`` URLs, and ``data:image/...;base64,...`` URIs. - """ - try: - from PIL import Image - except ImportError as exc: - raise RuntimeError( - "Pillow is required for multimodal rendering. Install with " - "`pip install Pillow` (or `pip install renderers[multimodal]`)." - ) from exc - - raw: Any - if "image" in item: - raw = item["image"] - elif "image_url" in item: - # OpenAI canonical shape is ``image_url: {"url": "..."}`` — but - # some VLM processors (Kimi K2.5 / K2.6) hand a raw PIL / str - # directly under ``image_url``. Accept both. - iu = item.get("image_url") - raw = iu.get("url") if isinstance(iu, dict) else iu - else: - raw = item.get("url") or item.get("path") - - if isinstance(raw, Image.Image): - return raw.convert("RGB") if raw.mode != "RGB" else raw - - if isinstance(raw, (bytes, bytearray)): - return Image.open(io.BytesIO(raw)).convert("RGB") - - if not isinstance(raw, str): - raise TypeError( - f"Unsupported image source {type(raw).__name__!r}; expected PIL " - "Image, bytes, path, http(s):// URL, file:// URL, or data: URI." - ) - - if raw.startswith("data:"): - # data:image/png;base64,XXXX - return Image.open(io.BytesIO(_data_image_bytes(raw))).convert("RGB") - - parsed = urlparse(raw) - if parsed.scheme in ("http", "https"): - import urllib.request - - with urllib.request.urlopen(raw) as resp: # noqa: S310 — user-supplied URL - return Image.open(io.BytesIO(resp.read())).convert("RGB") - - if parsed.scheme == "file" or parsed.scheme == "": - path = parsed.path if parsed.scheme == "file" else raw - return Image.open(path).convert("RGB") - - raise ValueError(f"Unsupported image URL scheme: {parsed.scheme!r} in {raw!r}") - - -def _image_hash(pil_image) -> str: - """Stable per-image identifier for cache lookup. - - Uses the resolved RGB bytes so two ``ImagePart``\\s pointing at the - same logical image (path, in-memory, data URI) hash identically. - """ - h = hashlib.sha256() - h.update(pil_image.tobytes()) - h.update(f"{pil_image.size}".encode()) - return h.hexdigest()[:32] +@dataclass(frozen=True) +class QwenVLImageLayoutSpec: + patch_size: int = 16 + temporal_patch_size: int = 2 + merge_size: int = 2 + min_pixels: int = 65536 + max_pixels: int = 16777216 -def _data_image_bytes(source: str) -> bytes: - if not source.startswith("data:image/"): - raise ValueError(f"Expected data:image URI, got {source!r}") - marker = ";base64," - if marker not in source: - raise ValueError("data:image URI must use base64 encoding") - _, b64 = source.split(marker, 1) - return base64.b64decode(b64) +QWEN_VL_IMAGE_LAYOUT = QwenVLImageLayoutSpec() @dataclass(frozen=True) @@ -185,8 +119,7 @@ class QwenImageLayoutDescriptor: image_grid_thw: list[list[int]] num_image_tokens: int fingerprint: str - raw_uri: str | None = None - raw_image_id: str | None = None + raw_image_id: str def _smart_resize( @@ -238,6 +171,15 @@ def _file_path_from_source(source: Any) -> Path | None: return None +def _offloaded_image_path(source: Any) -> Path: + path = _file_path_from_source(source) + if path is None: + raise ValueError( + "v1 multimodal image rendering requires offloaded file:// image assets" + ) + return path + + def _image_dimensions(source: Any) -> tuple[int, int]: try: from PIL import Image @@ -246,31 +188,16 @@ def _image_dimensions(source: Any) -> tuple[int, int]: "Pillow is required to read image dimensions for multimodal rendering." ) from exc - path = _file_path_from_source(source) - if path is not None: - with Image.open(path) as image: - return image.height, image.width - - image = _load_pil_image({"image": source}) - return image.height, image.width + with Image.open(_offloaded_image_path(source)) as image: + return image.height, image.width def _image_content_hash(source: Any) -> str: - path = _file_path_from_source(source) - if path is not None: - return hashlib.sha256(path.read_bytes()).hexdigest()[:32] - if isinstance(source, str) and source.startswith("data:image/"): - return hashlib.sha256(_data_image_bytes(source)).hexdigest()[:32] - return _image_hash(_load_pil_image({"image": source})) + return hashlib.sha256(_offloaded_image_path(source).read_bytes()).hexdigest()[:32] -def _raw_uri_and_id(source: Any) -> tuple[str | None, str | None]: - if isinstance(source, str) and source.startswith("data:image/"): - return source, None - path = _file_path_from_source(source) - if path is None: - return None, None - return path.as_uri(), path.name +def _raw_image_id(source: Any) -> str: + return _offloaded_image_path(source).name def describe_qwen_image_layout(part: dict[str, Any]) -> QwenImageLayoutDescriptor: @@ -299,14 +226,12 @@ def describe_qwen_image_layout(part: dict[str, Any]) -> QwenImageLayoutDescripto min_pixels=layout.min_pixels, max_pixels=layout.max_pixels, ) - raw_uri, raw_image_id = _raw_uri_and_id(source) return QwenImageLayoutDescriptor( mm_hash=_image_content_hash(source), image_grid_thw=[[grid_t, grid_h, grid_w]], num_image_tokens=num_image_tokens, fingerprint=fingerprint, - raw_uri=raw_uri, - raw_image_id=raw_image_id, + raw_image_id=_raw_image_id(source), ) @@ -317,7 +242,6 @@ def qwen_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str family="qwen_vl", layout_fingerprint=desc.fingerprint, payload={"image_grid_thw": desc.image_grid_thw}, - raw_uri=desc.raw_uri, raw_image_id=desc.raw_image_id, ) return desc.num_image_tokens, desc.mm_hash, item diff --git a/tests/test_client.py b/tests/test_client.py index 505ac32..967fe6d 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -2,7 +2,6 @@ import base64 import hashlib import json -from typing import Any import httpx import numpy as np @@ -15,7 +14,7 @@ ) from renderers.client import generate -_OPENAI_TOOL: Any = {"type": "function", "function": {"name": "echo"}} +_OPENAI_TOOL = {"type": "function", "function": {"name": "echo"}} class _FakeRenderer: @@ -105,113 +104,19 @@ async def post(self, path, *, cast_to=dict, body=None, options=None): ) -def test_run_image_dir_resolution_prefers_explicit_image_dir(tmp_path, monkeypatch): - from renderers.mm_store import run_image_dir +def test_offload_image_to_run_assets_writes_content_addressed_file(tmp_path): + from renderers.mm_store import offload_image_to_run_assets - image_dir = tmp_path / "custom-images" - monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) - monkeypatch.setenv("PRIME_RL_RUN_DIR", str(tmp_path / "run_other")) - monkeypatch.setenv("RUN_ID", "other") - - assert run_image_dir() == image_dir.resolve() - - -def test_run_image_dir_resolution_owns_run_prefix(monkeypatch): - from renderers.mm_store import run_image_dir - - monkeypatch.delenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", raising=False) - monkeypatch.delenv("PRIME_RL_RUN_DIR", raising=False) - monkeypatch.setenv("RUN_ID", "run_abc") - - assert run_image_dir().as_posix() == "/data/outputs/run_abc/assets/images" - - -class _TinyQwenTokenizer: - unk_token_id = -1 - _specials = { - "<|im_start|>": 1, - "<|im_end|>": 2, - "<|endoftext|>": 3, - "": 4, - "": 5, - "": 6, - "": 7, - "": 8, - "<|vision_start|>": 9, - "<|vision_end|>": 10, - "<|image_pad|>": 11, - "<|video_pad|>": 12, - } - - def convert_tokens_to_ids(self, token): - return self._specials.get(token, self.unk_token_id) - - def encode(self, text, add_special_tokens=False): - return [100 + ord(ch) % 50 for ch in text] - - -def test_qwen3_vl_render_emits_image_descriptor_without_processor(tmp_path): - pytest.importorskip("PIL") - from PIL import Image - from renderers.mm_store import IMAGE_REF_PAYLOAD_KEY, IMAGE_REF_PAYLOAD_VALUE - from renderers.qwen3_vl import Qwen3VLRenderer - - image_path = tmp_path / "image.png" - Image.new("RGB", (32, 32), color=(255, 0, 0)).save(image_path) - renderer = Qwen3VLRenderer(_TinyQwenTokenizer()) - - rendered = renderer.render( - [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_path.as_uri()}} - ], - } - ], - add_generation_prompt=True, - ) - - assert rendered.multi_modal_data is not None - item = rendered.multi_modal_data.mm_items["image"][0] - assert "pixel_values" not in item - assert item["family"] == "qwen_vl" - assert item["payload"]["image_grid_thw"] == [[1, 16, 16]] - assert item["raw_image_id"] == "image.png" - assert item[IMAGE_REF_PAYLOAD_KEY] == IMAGE_REF_PAYLOAD_VALUE - assert rendered.multi_modal_data.mm_placeholders["image"][0].length == 64 - - -def test_qwen3_vl_render_preserves_inline_data_uri_raw_source(tmp_path): - pytest.importorskip("PIL") - from PIL import Image - from renderers.qwen3_vl import Qwen3VLRenderer - - image_path = tmp_path / "image.png" - Image.new("RGB", (32, 32), color=(0, 255, 0)).save(image_path) - raw = image_path.read_bytes() - data_uri = f"data:image/png;base64,{base64.b64encode(raw).decode('ascii')}" - renderer = Qwen3VLRenderer(_TinyQwenTokenizer()) - - rendered = renderer.render( - [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_uri}} - ], - } - ], - add_generation_prompt=True, - ) - - assert rendered.multi_modal_data is not None - item = rendered.multi_modal_data.mm_items["image"][0] - assert item["raw_uri"] == data_uri - assert "raw_image_id" not in item - assert rendered.multi_modal_data.mm_hashes["image"][0] == hashlib.sha256(raw).hexdigest()[:32] + raw = b"png-ish bytes" + url = "data:image/png;base64," + base64.b64encode(raw).decode("ascii") + file_url = offload_image_to_run_assets(url, image_dir=tmp_path) + assert file_url is not None + assert file_url.startswith("file://") + path = tmp_path / file_url.rsplit("/", 1)[-1] + assert path.name == f"{hashlib.sha256(raw).hexdigest()[:16]}.png" + assert path.read_bytes() == raw def test_generate_builds_request_body_and_parses_response(): @@ -392,25 +297,28 @@ def test_generate_threads_prompt_attribution_through_prebuilt_prompt_path(): @pytest.mark.parametrize( - "renderer_class_path", + "family,payload,expected_modality,vllm_modality", [ - "renderers.qwen3_vl:Qwen3VLRenderer", - "renderers.qwen35:Qwen35Renderer", + ("qwen_vl", {"image_grid_thw": [[1, 2, 2]]}, "image", None), + ( + "kimi_k25", + {"grid_thws": [[1, 2, 2]], "num_media_tokens": 1}, + "vision_chunk", + "vision_chunk", + ), ], - ids=["qwen3_vl", "qwen35"], + ids=["default_image_modality", "kimi_vllm_modality"], ) -def test_generate_serializes_image_refs_for_qwen_vl_family( - tmp_path, monkeypatch, renderer_class_path +def test_generate_serializes_raw_mm_refs( + tmp_path, monkeypatch, family, payload, expected_modality, vllm_modality ): - """When the renderer emits ``MultiModalData``, ``generate`` translates - it into vLLM's ``features`` payload (mm_hashes + mm_placeholders + - image-ref kwargs_data) and sticks it in the request body. Every image slot - carries a lightweight raw ref.""" - import importlib + """``generate`` serializes raw multimodal envelopes to vLLM refs. + The client owns only the generic wire shape: hashes, placeholder spans, + and one raw ref per item. Family-specific payload keys stay opaque here. + """ from renderers.base import ( MultiModalData, - ParsedResponse, PlaceholderRange, ) from renderers.mm_store import ( @@ -419,59 +327,29 @@ def test_generate_serializes_image_refs_for_qwen_vl_family( split_raw_mm_ref, ) - mod_name, cls_name = renderer_class_path.split(":") - renderer_cls = getattr(importlib.import_module(mod_name), cls_name) - - class _BareRenderer(renderer_cls): - supports_tools = True - - def get_stop_token_ids(self): - return [99] - - def parse_response(self, completion_ids, *, tools=None): - return ParsedResponse(content="done") - - renderer = _BareRenderer.__new__(_BareRenderer) image_dir = tmp_path / "run_rawtest" / "assets" / "images" image_dir.mkdir(parents=True) (image_dir / "image.png").write_bytes(b"image-bytes") - (image_dir / "image2.png").write_bytes(b"other-image-bytes") monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) - monkeypatch.setenv("RUN_ID", "rawtest") - fingerprint = image_layout_fingerprint( - family="qwen_vl", - patch_size=16, - merge_size=2, - temporal_patch_size=2, - min_pixels=65536, - max_pixels=16777216, - ) + fingerprint = image_layout_fingerprint(family=family, revision="test") + mm_hash = "a" * 32 mm_data = MultiModalData( - mm_hashes={"image": ["a" * 32, "b" * 32]}, + mm_hashes={"image": [mm_hash]}, mm_placeholders={ "image": [ PlaceholderRange(offset=5, length=1), - PlaceholderRange(offset=10, length=1), ] }, mm_items={ "image": [ raw_mm_item( modality="image", - family="qwen_vl", + family=family, layout_fingerprint=fingerprint, - payload={"image_grid_thw": [[1, 2, 2]]}, - raw_uri=(image_dir / "image.png").as_uri(), + payload=payload, raw_image_id="image.png", - ), - raw_mm_item( - modality="image", - family="qwen_vl", - layout_fingerprint=fingerprint, - payload={"image_grid_thw": [[1, 2, 2]]}, - raw_uri=(image_dir / "image2.png").as_uri(), - raw_image_id="image2.png", + vllm_modality=vllm_modality, ), ], }, @@ -481,9 +359,9 @@ def parse_response(self, completion_ids, *, tools=None): result = asyncio.run( generate( client=client, - renderer=renderer, + renderer=_NoRenderRenderer(), messages=[], - model="qwen3-vl", + model="test-model", prompt_ids=list(range(20)), multi_modal_data=mm_data, sampling_params={"max_tokens": 4}, @@ -493,44 +371,27 @@ def parse_response(self, completion_ids, *, tools=None): body = client.calls[0]["body"] assert "features" in body, "multimodal call should attach features" features = body["features"] - assert features["mm_hashes"] == {"image": ["a" * 32, "b" * 32]} + assert features["mm_hashes"] == {expected_modality: [mm_hash]} assert features["mm_placeholders"] == { - "image": [{"offset": 5, "length": 1}, {"offset": 10, "length": 1}], + expected_modality: [{"offset": 5, "length": 1}], } - items = features["kwargs_data"]["image"] - ref = split_raw_mm_ref(items[0]) - assert ref.payload == {"image_grid_thw": [[1, 2, 2]]} + refs = features["kwargs_data"][expected_modality] + assert len(refs) == 1 + ref = split_raw_mm_ref(refs[0]) + assert ref.payload == payload assert ( - ref.run_id, ref.family, ref.fingerprint, ref.modality, ref.mm_hash, ref.raw_image_id, ) == ( - "rawtest", - "qwen_vl", + family, fingerprint, - "image", - "a" * 32, + expected_modality, + mm_hash, "image.png", ) - ref2 = split_raw_mm_ref(items[1]) - assert ( - ref2.run_id, - ref2.family, - ref2.fingerprint, - ref2.modality, - ref2.mm_hash, - ref2.raw_image_id, - ) == ( - "rawtest", - "qwen_vl", - fingerprint, - "image", - "b" * 32, - "image2.png", - ) assert result["multi_modal_data"] is mm_data diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py index 6b06add..b310809 100644 --- a/tests/test_multimodal.py +++ b/tests/test_multimodal.py @@ -138,6 +138,14 @@ def tiny_image(): return Image.new("RGB", (224, 224), color=(128, 192, 255)) +@pytest.fixture +def offloaded_tiny_image(tmp_path, tiny_image): + """Renderer-side image fixture: v1 renderers require offloaded file assets.""" + path = tmp_path / "tiny.png" + tiny_image.save(path) + return path.as_uri() + + # --------------------------------------------------------------------------- # Modality → (renderer-side content part, processor-side image-list builder). # Each modality has its own "make a content part" / "extract source images" @@ -451,7 +459,9 @@ def _supports_tool_message_images(renderer) -> bool: @pytest.mark.parametrize( "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES] ) -def test_multimodal_byte_parity_vs_processor(mm_model_name, modality, tiny_image): +def test_multimodal_byte_parity_vs_processor( + mm_model_name, modality, tiny_image, offloaded_tiny_image +): """Token byte-parity with ``processor.apply_chat_template`` + ``processor(...)``. Locks in the property that lets the inference engine see byte-identical @@ -464,8 +474,12 @@ def test_multimodal_byte_parity_vs_processor(mm_model_name, modality, tiny_image kit = _modality_kit(modality, mm_model_name) tokenizer, processor, renderer = _load_processor_and_renderer(mm_model_name) - for case in _build_cases(kit["make_part"], tiny_image): - messages, add_gp = case.values + renderer_cases = _build_cases(kit["make_part"], offloaded_tiny_image) + processor_cases = _build_cases(kit["make_part"], tiny_image) + for renderer_case, processor_case in zip(renderer_cases, processor_cases, strict=True): + messages, add_gp = renderer_case.values + processor_messages, processor_add_gp = processor_case.values + assert add_gp == processor_add_gp # Ours. ours = renderer.render_ids(messages, add_generation_prompt=add_gp) @@ -473,10 +487,10 @@ def test_multimodal_byte_parity_vs_processor(mm_model_name, modality, tiny_image # Theirs: family-specific processor call. Qwen-VL is a two-step # (apply_chat_template + processor(images=, text=)); Kimi K2.5 is # a one-shot processor(messages=). - theirs = kit["processor_input_ids"](processor, messages, add_gp) + theirs = kit["processor_input_ids"](processor, processor_messages, add_gp) assert ours == theirs, ( - f"{mm_model_name} / {modality} / case={case.id}: " + f"{mm_model_name} / {modality} / case={renderer_case.id}: " f"renderer diverges from processor.\n" f" ours[:80]={ours[:80]}\n theirs[:80]={theirs[:80]}\n" f" len(ours)={len(ours)} len(theirs)={len(theirs)}" @@ -486,7 +500,9 @@ def test_multimodal_byte_parity_vs_processor(mm_model_name, modality, tiny_image @pytest.mark.parametrize( "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES] ) -def test_multimodal_placeholders_match_pad_runs(mm_model_name, modality, tiny_image): +def test_multimodal_placeholders_match_pad_runs( + mm_model_name, modality, offloaded_tiny_image +): """``mm_placeholders`` exactly cover the runs of the modality's pad token.""" if not _hf_snapshot_cached(mm_model_name): pytest.skip(f"{mm_model_name}: HF snapshot not cached locally") @@ -495,7 +511,7 @@ def test_multimodal_placeholders_match_pad_runs(mm_model_name, modality, tiny_im tokenizer, _, renderer = _load_processor_and_renderer(mm_model_name) pad_id = tokenizer.convert_tokens_to_ids(kit["placeholder_token"]) - for case in _build_cases(kit["make_part"], tiny_image): + for case in _build_cases(kit["make_part"], offloaded_tiny_image): messages, add_gp = case.values rendered = renderer.render(messages, add_generation_prompt=add_gp) @@ -529,7 +545,7 @@ def test_multimodal_placeholders_match_pad_runs(mm_model_name, modality, tiny_im "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES] ) def test_multimodal_bridge_extends_and_carries_mm_data( - mm_model_name, modality, tiny_image + mm_model_name, modality, offloaded_tiny_image ): """Bridge-to-next-turn invariants for the multimodal case. @@ -567,7 +583,7 @@ def test_multimodal_bridge_extends_and_carries_mm_data( { "role": "user", "content": [ - kit["make_part"](tiny_image), + kit["make_part"](offloaded_tiny_image), {"type": "text", "text": "Turn one."}, ], } @@ -576,7 +592,7 @@ def test_multimodal_bridge_extends_and_carries_mm_data( { "role": "user", "content": [ - kit["make_part"](tiny_image), + kit["make_part"](offloaded_tiny_image), {"type": "text", "text": "Turn two."}, ], } @@ -665,7 +681,9 @@ def test_modality_registry_models_route_to_renderer(): @pytest.mark.parametrize( "mm_model_name,modality", _CASES, ids=[f"{m}|{mo}" for m, mo in _CASES] ) -def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image): +def test_tool_response_image_byte_parity( + mm_model_name, modality, tiny_image, offloaded_tiny_image +): """Tool-message image parity vs ``processor.apply_chat_template`` + ``processor(...)``. Browser-agent SFT traces carry post-action screenshots as ``tool`` @@ -688,12 +706,16 @@ def test_tool_response_image_byte_parity(mm_model_name, modality, tiny_image): f"{type(renderer).__name__} does not yet emit images inside tool responses" ) - for case in _build_tool_image_cases(kit["make_part"], tiny_image): - messages, add_gp = case.values + renderer_cases = _build_tool_image_cases(kit["make_part"], offloaded_tiny_image) + processor_cases = _build_tool_image_cases(kit["make_part"], tiny_image) + for renderer_case, processor_case in zip(renderer_cases, processor_cases, strict=True): + messages, add_gp = renderer_case.values + processor_messages, processor_add_gp = processor_case.values + assert add_gp == processor_add_gp ours = renderer.render_ids(messages, add_generation_prompt=add_gp) - theirs = kit["processor_input_ids"](processor, messages, add_gp) + theirs = kit["processor_input_ids"](processor, processor_messages, add_gp) assert ours == theirs, ( - f"{mm_model_name} / tool / case={case.id}: " + f"{mm_model_name} / tool / case={renderer_case.id}: " f"renderer diverges from processor.\n" f" len(ours)={len(ours)} len(theirs)={len(theirs)}\n" f" ours[:60]={ours[:60]}\n theirs[:60]={theirs[:60]}" @@ -750,7 +772,7 @@ def _qwen_vl_processor_input_ids_with_kwargs( ) @pytest.mark.parametrize("add_vision_id", [True, False]) def test_add_vision_id_parity_vs_processor( - mm_model_name, modality, add_vision_id, tiny_image + mm_model_name, modality, add_vision_id, tiny_image, offloaded_tiny_image ): """Parity for ``add_vision_id`` across image-bearing shapes. @@ -775,15 +797,19 @@ def test_add_vision_id_parity_vs_processor( if hasattr(renderer, "_processor") and renderer._processor is None: renderer._processor = processor - for case in _build_cases(kit["make_part"], tiny_image): - messages, add_gp = case.values + renderer_cases = _build_cases(kit["make_part"], offloaded_tiny_image) + processor_cases = _build_cases(kit["make_part"], tiny_image) + for renderer_case, processor_case in zip(renderer_cases, processor_cases, strict=True): + messages, add_gp = renderer_case.values + processor_messages, processor_add_gp = processor_case.values + assert add_gp == processor_add_gp ours = renderer.render_ids(messages, add_generation_prompt=add_gp) theirs = _qwen_vl_processor_input_ids_with_kwargs( - processor, messages, add_gp, add_vision_id=add_vision_id + processor, processor_messages, add_gp, add_vision_id=add_vision_id ) assert ours == theirs, ( f"{mm_model_name} / add_vision_id={add_vision_id} / " - f"case={case.id}: renderer diverges from processor.\n" + f"case={renderer_case.id}: renderer diverges from processor.\n" f" ours[:80]={ours[:80]}\n theirs[:80]={theirs[:80]}\n" f" len(ours)={len(ours)} len(theirs)={len(theirs)}" ) @@ -795,7 +821,7 @@ def test_add_vision_id_parity_vs_processor( ids=[f"{m}|{mo}" for m, mo in _ADD_VISION_ID_CASES], ) def test_bridge_refuses_when_add_vision_id_loses_prior_count( - mm_model_name, modality, tiny_image + mm_model_name, modality, offloaded_tiny_image ): """When ``add_vision_id=True``, the bridge needs the prior turn's image / video count to keep the ``Picture N:`` numbering correct. @@ -832,7 +858,7 @@ def test_bridge_refuses_when_add_vision_id_loses_prior_count( { "role": "user", "content": [ - kit["make_part"](tiny_image), + kit["make_part"](offloaded_tiny_image), {"type": "text", "text": "Turn one."}, ], } @@ -841,7 +867,7 @@ def test_bridge_refuses_when_add_vision_id_loses_prior_count( { "role": "user", "content": [ - kit["make_part"](tiny_image), + kit["make_part"](offloaded_tiny_image), {"type": "text", "text": "Turn two."}, ], } @@ -898,7 +924,7 @@ def test_is_image_part_treats_type_field_as_authoritative(): ``text: None`` added to every image part). The classifier must treat the ``type`` field as authoritative when present — falling back to a key-presence check on ``image_url`` would misclassify the text - part and the renderer would later raise on ``_load_pil_image(None)``. + part and the renderer would later try to resolve ``None`` as an image. """ from renderers.qwen3_vl import _is_image_part, _is_video_part diff --git a/tests/test_multimodal_image_layout_parity.py b/tests/test_multimodal_image_layout_parity.py deleted file mode 100644 index d9869ec..0000000 --- a/tests/test_multimodal_image_layout_parity.py +++ /dev/null @@ -1,125 +0,0 @@ -"""Image-layout descriptor parity against real HF processors.""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import Any - -import pytest - -from renderers.image_layout_specs import KIMI_K25_IMAGE_LAYOUT, QWEN_VL_IMAGE_LAYOUT -from renderers.kimi_k25 import describe_kimi_image_layout -from renderers.qwen3_vl import describe_qwen_image_layout - -pytest.importorskip("PIL", reason="Pillow required for image layout parity tests") -pytest.importorskip("torch", reason="torch required for image layout parity tests") -pytest.importorskip( - "transformers", reason="transformers required for image layout parity tests" -) - -from PIL import Image # noqa: E402 -from transformers import AutoProcessor # noqa: E402 - - -QWEN_MODEL = "Qwen/Qwen3-VL-4B-Instruct" -KIMI_MODEL = "moonshotai/Kimi-K2.5" -KIMI_REVISION = "4d01dfe0332d63057c186e0b262165819efb6611" - -IMAGE_SIZES = [(32, 32), (64, 256), (512, 512)] - - -def _hf_snapshot_cached(model_name: str) -> bool: - cache = ( - Path(os.environ.get("HF_HOME") or Path.home() / ".cache" / "huggingface") - / "hub" - ) - snapshots = cache / ("models--" + model_name.replace("/", "--")) / "snapshots" - return snapshots.is_dir() and any(p.is_dir() for p in snapshots.iterdir()) - - -def _load_processor(model_name: str, **kwargs: Any): - if not _hf_snapshot_cached(model_name): - pytest.skip(f"{model_name}: HF snapshot not cached locally") - return AutoProcessor.from_pretrained(model_name, **kwargs) - - -def _images(): - return [ - Image.new("RGB", size, color=(64 + idx * 32, 128, 192)) - for idx, size in enumerate(IMAGE_SIZES) - ] - - -def _tensor_rows(value: Any) -> list[list[int]]: - return [[int(cell) for cell in row] for row in value.tolist()] - - -def test_qwen_image_layout_descriptor_matches_processor(): - processor = _load_processor(QWEN_MODEL) - images = _images() - messages = [ - { - "role": "user", - "content": [ - item - for idx, image in enumerate(images) - for item in ( - {"type": "image", "image": image}, - {"type": "text", "text": f"image {idx}"}, - ) - ], - } - ] - - text = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - processor_grids = _tensor_rows( - processor(images=images, text=text, return_tensors="pt")["image_grid_thw"] - ) - descriptors = [ - describe_qwen_image_layout({"type": "image", "image": image}) - for image in images - ] - - assert [desc.image_grid_thw[0] for desc in descriptors] == processor_grids - merge_area = QWEN_VL_IMAGE_LAYOUT.merge_size**2 - assert [desc.num_image_tokens for desc in descriptors] == [ - grid_t * grid_h * grid_w // merge_area - for grid_t, grid_h, grid_w in processor_grids - ] - - -def test_kimi_image_layout_descriptor_matches_processor(): - processor = _load_processor( - KIMI_MODEL, trust_remote_code=True, revision=KIMI_REVISION - ) - images = _images() - messages = [ - { - "role": "user", - "content": [ - item - for idx, image in enumerate(images) - for item in ( - {"type": "image_url", "image_url": image}, - {"type": "text", "text": f"image {idx}"}, - ) - ], - } - ] - - out = processor(messages=messages, add_generation_prompt=True, return_tensors="pt") - processor_grids = _tensor_rows(out["grid_thws"]) - descriptors = [ - describe_kimi_image_layout({"type": "image_url", "image_url": image}) - for image in images - ] - - assert [desc.grid_thws[0] for desc in descriptors] == processor_grids - merge_area = KIMI_K25_IMAGE_LAYOUT.merge_kernel_size**2 - assert [desc.num_media_tokens for desc in descriptors] == [ - grid_t * grid_h * grid_w // merge_area - for grid_t, grid_h, grid_w in processor_grids - ] From af84c192df5a8cbaa5fa51b1ee5c85fdd8d8e5e2 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Mon, 29 Jun 2026 16:57:42 +0000 Subject: [PATCH 10/16] Clarify raw image asset contract --- renderers/mm_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 53c69dc..1ffc5f4 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -17,6 +17,7 @@ from dataclasses import dataclass from pathlib import Path +# Contract: must match prime_rl.utils.run_assets.IMAGE_OFFLOAD_DIR_ENV. IMAGE_OFFLOAD_DIR_ENV = "VF_RENDERER_IMAGE_OFFLOAD_DIR" IMAGE_REF_PREFIX = "mmraw" @@ -200,7 +201,7 @@ def raw_mm_ref( _ensure_safe("image layout fingerprint", fingerprint) _ensure_safe("raw multimodal modality", modality) _ensure_safe("image hash", mm_hash) - raw_image_path(raw_image_id=raw_image_id) + raw_image_id = _ensure_safe("raw image id", raw_image_id) ref_payload: dict[str, object] = { "family": family, From 4e3502fcd3511871a69ba80c2004fc9e88961a1f Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Mon, 29 Jun 2026 17:03:28 +0000 Subject: [PATCH 11/16] Apply ruff formatting --- renderers/base.py | 1 - renderers/configs.py | 1 + renderers/qwen35.py | 1 - tests/test_multimodal.py | 12 +++++++++--- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/renderers/base.py b/renderers/base.py index 1e1e9da..a442e61 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -969,7 +969,6 @@ def bridge_to_next_turn(self, *args: Any, **kwargs: Any) -> "RenderedTokens | No with self.checkout() as r: return r.bridge_to_next_turn(*args, **kwargs) - # ``mm_token_type_id_map`` (the MultimodalRenderer protocol attribute) # is set in ``__init__`` only for pools wrapping multimodal renderers; # see the comment there for why this isn't a class-level property. diff --git a/renderers/configs.py b/renderers/configs.py index efa52d8..d5be88c 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -225,6 +225,7 @@ def _check_thinking_retention(self): ) return self + class Qwen3VLRendererConfig(BaseRendererConfig): """Qwen3-VL renderer config.""" diff --git a/renderers/qwen35.py b/renderers/qwen35.py index a73cfb7..49008dc 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -162,7 +162,6 @@ def mm_token_type_id_map(self) -> dict[int, int]: """ return {self._image_pad: 1, self._video_pad: 2} - @staticmethod def _content_has_media(content: Any) -> bool: """True when ``content`` is a structured list containing image / video parts.""" diff --git a/tests/test_multimodal.py b/tests/test_multimodal.py index b310809..1a20e45 100644 --- a/tests/test_multimodal.py +++ b/tests/test_multimodal.py @@ -476,7 +476,9 @@ def test_multimodal_byte_parity_vs_processor( renderer_cases = _build_cases(kit["make_part"], offloaded_tiny_image) processor_cases = _build_cases(kit["make_part"], tiny_image) - for renderer_case, processor_case in zip(renderer_cases, processor_cases, strict=True): + for renderer_case, processor_case in zip( + renderer_cases, processor_cases, strict=True + ): messages, add_gp = renderer_case.values processor_messages, processor_add_gp = processor_case.values assert add_gp == processor_add_gp @@ -708,7 +710,9 @@ def test_tool_response_image_byte_parity( renderer_cases = _build_tool_image_cases(kit["make_part"], offloaded_tiny_image) processor_cases = _build_tool_image_cases(kit["make_part"], tiny_image) - for renderer_case, processor_case in zip(renderer_cases, processor_cases, strict=True): + for renderer_case, processor_case in zip( + renderer_cases, processor_cases, strict=True + ): messages, add_gp = renderer_case.values processor_messages, processor_add_gp = processor_case.values assert add_gp == processor_add_gp @@ -799,7 +803,9 @@ def test_add_vision_id_parity_vs_processor( renderer_cases = _build_cases(kit["make_part"], offloaded_tiny_image) processor_cases = _build_cases(kit["make_part"], tiny_image) - for renderer_case, processor_case in zip(renderer_cases, processor_cases, strict=True): + for renderer_case, processor_case in zip( + renderer_cases, processor_cases, strict=True + ): messages, add_gp = renderer_case.values processor_messages, processor_add_gp = processor_case.values assert add_gp == processor_add_gp From 998e1db2c40bf51783134acb5443dcd7c982cc4d Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Mon, 29 Jun 2026 17:37:43 +0000 Subject: [PATCH 12/16] Preserve multimodal sidecar for prebuilt prompts --- renderers/client.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index 6132ad1..14ec73b 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -213,12 +213,14 @@ async def generate( def _prepare(): if prompt_ids is not None: # Caller-supplied prompt; if they also gave us pre-computed - # attribution (e.g. the bridge path in verifiers), thread it - # through unchanged. + # attribution (e.g. the bridge path in verifiers), thread it through. + prompt_mm_data = multi_modal_data + if prompt_mm_data is None and prompt_attribution is not None: + prompt_mm_data = prompt_attribution.multi_modal_data return ( list(prompt_ids), renderer.get_stop_token_ids(), - multi_modal_data, + prompt_mm_data, prompt_attribution, ) rendered = renderer.render(messages, tools=tools, add_generation_prompt=True) @@ -261,10 +263,7 @@ def _features_and_descriptor_mm() -> tuple[ return _build_vllm_mm_features(mm_data), mm_data features, out_mm_data = await _maybe_offload(renderer, _features_and_descriptor_mm) - if ( - prompt_attr is not None - and getattr(prompt_attr, "multi_modal_data", None) is not None - ): + if prompt_attr is not None and out_mm_data is not None: prompt_attr = replace(prompt_attr, multi_modal_data=out_mm_data) if features is not None: body["features"] = features From 2a19d759860f46eb269a56b6f69f5efe0f38b7a1 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Mon, 29 Jun 2026 19:43:10 +0000 Subject: [PATCH 13/16] Use URI-based raw image refs --- renderers/client.py | 8 ++++---- renderers/kimi_k25.py | 10 +++++----- renderers/mm_store.py | 40 +++++++++++++--------------------------- renderers/qwen3_vl.py | 10 +++++----- tests/test_client.py | 13 +++++++------ 5 files changed, 34 insertions(+), 47 deletions(-) diff --git a/renderers/client.py b/renderers/client.py index 14ec73b..73fe5c5 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -388,12 +388,12 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: if not isinstance(feature_modality, str) or not feature_modality: raise ValueError("raw multimodal item has invalid vllm_modality") - raw_image_id = item.get("raw_image_id") + raw_image_uri = item.get("raw_image_uri") family = item.get("family") fingerprint = item.get("layout_fingerprint") payload = item.get("payload") - if not isinstance(raw_image_id, str) or not raw_image_id: - raise ValueError("raw multimodal item is missing raw_image_id") + if not isinstance(raw_image_uri, str) or not raw_image_uri: + raise ValueError("raw multimodal item is missing raw_image_uri") if not isinstance(family, str) or not family: raise ValueError("raw multimodal item is missing family") if not isinstance(fingerprint, str) or not fingerprint: @@ -411,7 +411,7 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: fingerprint=fingerprint, modality=feature_modality, mm_hash=mm_hashes[idx], - raw_image_id=raw_image_id, + raw_image_uri=raw_image_uri, payload=payload, ) ) diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index 741c8ba..ebd5f04 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -45,6 +45,7 @@ trim_to_turn_close, ) from renderers.configs import KimiK25RendererConfig +from renderers.mm_store import image_layout_fingerprint, raw_mm_item from renderers.parsing import _reasoning_end_token_index, parse_kimi_k2_section from renderers.qwen3_vl import ( _image_content_hash, @@ -52,9 +53,8 @@ _image_source, _is_image_part, _is_video_part, - _raw_image_id, + _raw_image_uri, ) -from renderers.mm_store import image_layout_fingerprint, raw_mm_item # --------------------------------------------------------------------------- # Constants @@ -430,7 +430,7 @@ class KimiImageLayoutDescriptor: grid_thws: list[list[int]] num_media_tokens: int fingerprint: str - raw_image_id: str + raw_image_uri: str def _ceil_to_factor(value: int, factor: int) -> int: @@ -486,7 +486,7 @@ def describe_kimi_image_layout(part: dict[str, Any]) -> KimiImageLayoutDescripto grid_thws=grid_thws, num_media_tokens=num_media_tokens, fingerprint=fingerprint, - raw_image_id=_raw_image_id(source), + raw_image_uri=_raw_image_uri(source), ) @@ -500,7 +500,7 @@ def kimi_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str "grid_thws": desc.grid_thws, "num_media_tokens": desc.num_media_tokens, }, - raw_image_id=desc.raw_image_id, + raw_image_uri=desc.raw_image_uri, vllm_modality=KIMI_K25_VLLM_MODALITY, ) return 1, desc.mm_hash, item diff --git a/renderers/mm_store.py b/renderers/mm_store.py index 1ffc5f4..af91a4b 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -21,7 +21,6 @@ IMAGE_OFFLOAD_DIR_ENV = "VF_RENDERER_IMAGE_OFFLOAD_DIR" IMAGE_REF_PREFIX = "mmraw" -IMAGE_REF_VERSION = "v3" RAW_MM_ITEM_KIND = "prime_raw_mm_item" RAW_MM_ITEM_VERSION = 1 @@ -30,7 +29,6 @@ "raw multimodal modality": re.compile(r"^[A-Za-z0-9_.-]+$"), "image layout fingerprint": re.compile(r"^[a-f0-9]{16,64}$"), "image hash": re.compile(r"^[a-f0-9]{16,128}$"), - "raw image id": re.compile(r"^[A-Za-z0-9_.-]+$"), "raw multimodal ref payload segment": re.compile(r"^[A-Za-z0-9_-]*$"), } @@ -100,15 +98,6 @@ def offload_image_to_run_assets( return path.as_uri() -def raw_image_path(*, raw_image_id: str) -> Path: - _ensure_safe("raw image id", raw_image_id) - root = run_image_dir() - path = (root / raw_image_id).resolve() - if not path.is_relative_to(root): - raise ValueError(f"Raw image path escaped root: {path}") - return path - - def _json_fingerprint_value(value: object) -> str: return json.dumps(value, sort_keys=True, separators=(",", ":"), default=str) @@ -147,7 +136,7 @@ def raw_mm_item( family: str, layout_fingerprint: str, payload: dict[str, object], - raw_image_id: str, + raw_image_uri: str, vllm_modality: str | None = None, ) -> dict[str, object]: """Build the JSON-safe raw multimodal descriptor envelope. @@ -169,7 +158,7 @@ def raw_mm_item( } if vllm_modality is not None: out["vllm_modality"] = vllm_modality - out["raw_image_id"] = _ensure_safe("raw image id", raw_image_id) + out["raw_image_uri"] = raw_image_uri return out @@ -180,7 +169,7 @@ class RawMMRef: modality: str mm_hash: str payload: dict[str, object] - raw_image_id: str + raw_image_uri: str def raw_mm_ref( @@ -189,7 +178,7 @@ def raw_mm_ref( fingerprint: str, modality: str, mm_hash: str, - raw_image_id: str, + raw_image_uri: str, payload: dict[str, object] | None = None, ) -> str: """Generic raw multimodal asset ref. @@ -201,7 +190,6 @@ def raw_mm_ref( _ensure_safe("image layout fingerprint", fingerprint) _ensure_safe("raw multimodal modality", modality) _ensure_safe("image hash", mm_hash) - raw_image_id = _ensure_safe("raw image id", raw_image_id) ref_payload: dict[str, object] = { "family": family, @@ -209,23 +197,23 @@ def raw_mm_ref( "modality": modality, "mm_hash": mm_hash, "payload": payload or {}, - "raw_image_id": raw_image_id, + "raw_image_uri": raw_image_uri, } - return f"{IMAGE_REF_PREFIX}:{IMAGE_REF_VERSION}:{_encode_ref_payload(ref_payload)}" + return f"{IMAGE_REF_PREFIX}:{_encode_ref_payload(ref_payload)}" def split_raw_mm_ref(ref: str) -> RawMMRef: parts = ref.split(":") - if parts[:2] != [IMAGE_REF_PREFIX, IMAGE_REF_VERSION] or len(parts) != 3: + if len(parts) != 2 or parts[0] != IMAGE_REF_PREFIX: raise ValueError(f"Invalid raw multimodal ref shape: {ref!r}") - payload = _decode_ref_payload(parts[2]) + payload = _decode_ref_payload(parts[1]) family = payload.get("family") fingerprint = payload.get("fingerprint") modality = payload.get("modality") mm_hash = payload.get("mm_hash") - raw_image_id = payload.get("raw_image_id") + raw_image_uri = payload.get("raw_image_uri") item_payload = payload.get("payload") if not isinstance(family, str): @@ -236,8 +224,8 @@ def split_raw_mm_ref(ref: str) -> RawMMRef: raise ValueError("Raw multimodal ref is missing modality") if not isinstance(mm_hash, str): raise ValueError("Raw multimodal ref is missing mm_hash") - if not isinstance(raw_image_id, str): - raise ValueError("Raw multimodal ref is missing raw_image_id") + if not isinstance(raw_image_uri, str): + raise ValueError("Raw multimodal ref is missing raw_image_uri") if not isinstance(item_payload, dict): raise ValueError("Raw multimodal ref payload must be a dict") @@ -247,11 +235,9 @@ def split_raw_mm_ref(ref: str) -> RawMMRef: modality=_ensure_safe("raw multimodal modality", modality), mm_hash=_ensure_safe("image hash", mm_hash), payload=item_payload, - raw_image_id=_ensure_safe("raw image id", raw_image_id), + raw_image_uri=raw_image_uri, ) def is_raw_mm_ref(ref: object) -> bool: - return isinstance(ref, str) and ref.startswith( - f"{IMAGE_REF_PREFIX}:{IMAGE_REF_VERSION}:" - ) + return isinstance(ref, str) and ref.startswith(f"{IMAGE_REF_PREFIX}:") diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 961d145..e780ed7 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -119,7 +119,7 @@ class QwenImageLayoutDescriptor: image_grid_thw: list[list[int]] num_image_tokens: int fingerprint: str - raw_image_id: str + raw_image_uri: str def _smart_resize( @@ -196,8 +196,8 @@ def _image_content_hash(source: Any) -> str: return hashlib.sha256(_offloaded_image_path(source).read_bytes()).hexdigest()[:32] -def _raw_image_id(source: Any) -> str: - return _offloaded_image_path(source).name +def _raw_image_uri(source: Any) -> str: + return _offloaded_image_path(source).as_uri() def describe_qwen_image_layout(part: dict[str, Any]) -> QwenImageLayoutDescriptor: @@ -231,7 +231,7 @@ def describe_qwen_image_layout(part: dict[str, Any]) -> QwenImageLayoutDescripto image_grid_thw=[[grid_t, grid_h, grid_w]], num_image_tokens=num_image_tokens, fingerprint=fingerprint, - raw_image_id=_raw_image_id(source), + raw_image_uri=_raw_image_uri(source), ) @@ -242,7 +242,7 @@ def qwen_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str family="qwen_vl", layout_fingerprint=desc.fingerprint, payload={"image_grid_thw": desc.image_grid_thw}, - raw_image_id=desc.raw_image_id, + raw_image_uri=desc.raw_image_uri, ) return desc.num_image_tokens, desc.mm_hash, item diff --git a/tests/test_client.py b/tests/test_client.py index 967fe6d..ac60a27 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -310,7 +310,7 @@ def test_generate_threads_prompt_attribution_through_prebuilt_prompt_path(): ids=["default_image_modality", "kimi_vllm_modality"], ) def test_generate_serializes_raw_mm_refs( - tmp_path, monkeypatch, family, payload, expected_modality, vllm_modality + tmp_path, family, payload, expected_modality, vllm_modality ): """``generate`` serializes raw multimodal envelopes to vLLM refs. @@ -329,8 +329,9 @@ def test_generate_serializes_raw_mm_refs( image_dir = tmp_path / "run_rawtest" / "assets" / "images" image_dir.mkdir(parents=True) - (image_dir / "image.png").write_bytes(b"image-bytes") - monkeypatch.setenv("VF_RENDERER_IMAGE_OFFLOAD_DIR", str(image_dir)) + image_path = image_dir / "image.png" + image_path.write_bytes(b"image-bytes") + image_uri = image_path.as_uri() fingerprint = image_layout_fingerprint(family=family, revision="test") mm_hash = "a" * 32 @@ -348,7 +349,7 @@ def test_generate_serializes_raw_mm_refs( family=family, layout_fingerprint=fingerprint, payload=payload, - raw_image_id="image.png", + raw_image_uri=image_uri, vllm_modality=vllm_modality, ), ], @@ -384,13 +385,13 @@ def test_generate_serializes_raw_mm_refs( ref.fingerprint, ref.modality, ref.mm_hash, - ref.raw_image_id, + ref.raw_image_uri, ) == ( family, fingerprint, expected_modality, mm_hash, - "image.png", + image_uri, ) assert result["multi_modal_data"] is mm_data From aa2d44d06a40ee31aaa68acd2dae072a8dcca585 Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Mon, 29 Jun 2026 20:50:17 +0000 Subject: [PATCH 14/16] Drop raw multimodal version markers --- renderers/mm_store.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/renderers/mm_store.py b/renderers/mm_store.py index af91a4b..ce07ab9 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -22,7 +22,6 @@ IMAGE_REF_PREFIX = "mmraw" RAW_MM_ITEM_KIND = "prime_raw_mm_item" -RAW_MM_ITEM_VERSION = 1 _SAFE = { "multimodal family": re.compile(r"^[A-Za-z0-9_.-]+$"), @@ -126,7 +125,7 @@ def image_layout_fingerprint(*, family: str, **values: object) -> str: encoded_values = ":".join( f"{key}={_json_fingerprint_value(values[key])}" for key in sorted(values) ) - raw = f"image-layout:v1:{family}:{encoded_values}".encode("utf-8") + raw = f"image-layout:{family}:{encoded_values}".encode("utf-8") return hashlib.sha256(raw).hexdigest()[:32] @@ -150,7 +149,6 @@ def raw_mm_item( _ensure_safe("image layout fingerprint", layout_fingerprint) out: dict[str, object] = { "kind": RAW_MM_ITEM_KIND, - "version": RAW_MM_ITEM_VERSION, "modality": modality, "family": family, "layout_fingerprint": layout_fingerprint, From ed5b404edf690440b9845a974bb097d9458c308a Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Tue, 30 Jun 2026 18:15:33 +0000 Subject: [PATCH 15/16] Support processed multimodal renderer output --- pyproject.toml | 7 ++ renderers/__init__.py | 2 + renderers/base.py | 13 ++- renderers/client.py | 3 +- renderers/configs.py | 17 +++- renderers/kimi_k25.py | 76 ++++++++++++++- renderers/mm_store.py | 8 +- renderers/qwen35.py | 29 +++++- renderers/qwen3_vl.py | 129 ++++++++++++++++++++++++-- tests/test_multimodal_output_modes.py | 71 ++++++++++++++ tests/test_renderer_config.py | 5 +- uv.lock | 51 ++++++---- 12 files changed, 359 insertions(+), 52 deletions(-) create mode 100644 tests/test_multimodal_output_modes.py diff --git a/pyproject.toml b/pyproject.toml index dc75397..3db0939 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,13 @@ dependencies = [ "prime-pydantic-config>=0.3.0.dev83", ] +[project.optional-dependencies] +vision = [ + "pillow>=12.2.0", + "torch>=2.11.0", + "torchvision>=0.26.0", +] + [tool.hatch.version] source = "vcs" # Tags look like ``renderers-v0.1.8`` (prefix matches the publish.yml diff --git a/renderers/__init__.py b/renderers/__init__.py index 9fd385e..bb8e6ba 100644 --- a/renderers/__init__.py +++ b/renderers/__init__.py @@ -55,6 +55,7 @@ LagunaXS2RendererConfig, Llama3RendererConfig, MiniMaxM2RendererConfig, + MultimodalOutput, Nemotron3RendererConfig, Nemotron3UltraRendererConfig, Qwen35RendererConfig, @@ -144,6 +145,7 @@ def __dir__() -> list[str]: "Message", "MiniMaxM2Renderer", "MiniMaxM2RendererConfig", + "MultimodalOutput", "MultiModalData", "MultimodalRenderer", "Nemotron3Renderer", diff --git a/renderers/base.py b/renderers/base.py index a442e61..81d3d60 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -205,11 +205,10 @@ class PlaceholderRange: class MultiModalData: """Multimodal sidecar produced alongside the token stream. - Renderer output is framework-agnostic: ``mm_items[modality][i]`` is a - plain raw descriptor envelope with a model-family key and an adapter-owned - payload. Translation to engine-specific wire formats — vLLM image refs, - SGLang payloads, etc. — happens in the inference glue layer (see - ``renderers.client``). + ``mm_items[modality][i]`` follows the renderer's configured + ``multimodal_output``. The default ``"raw"`` mode emits JSON-safe image + descriptor envelopes for inference paths. ``"processed"`` emits + image-processor payloads such as ``pixel_values`` for SFT/training paths. """ mm_hashes: dict[str, list[str]] = field(default_factory=dict) @@ -1474,7 +1473,7 @@ def _resolve_auto_config( model_name = getattr(tokenizer, "name_or_path", "") renderer_name = MODEL_RENDERER_MAP.get(model_name) - preserve_carry = {} + preserve_carry: dict[str, Any] = {"multimodal_output": auto.multimodal_output} if auto.thinking_retention is not None: preserve_carry["thinking_retention"] = auto.thinking_retention @@ -1525,7 +1524,7 @@ def _resolve_auto_config( "reasoning_parser=...) to enable structured output parsing.", model_name or "", ) - return DefaultRendererConfig() + return DefaultRendererConfig(multimodal_output=auto.multimodal_output) # --------------------------------------------------------------------------- diff --git a/renderers/client.py b/renderers/client.py index 73fe5c5..c8015da 100644 --- a/renderers/client.py +++ b/renderers/client.py @@ -381,7 +381,8 @@ def _build_vllm_mm_features(mm_data: MultiModalData) -> dict[str, Any]: for idx, item in enumerate(items): if item.get("kind") != RAW_MM_ITEM_KIND: raise NotImplementedError( - "Multimodal serialization requires raw descriptor envelopes; " + "renderers.client.generate() requires raw multimodal " + "descriptor envelopes (multimodal_output='raw'); " f"got item keys {sorted(item)} for modality {source_modality!r}." ) feature_modality = item.get("vllm_modality") or source_modality diff --git a/renderers/configs.py b/renderers/configs.py index d5be88c..5be3362 100644 --- a/renderers/configs.py +++ b/renderers/configs.py @@ -52,6 +52,9 @@ def _reject_thinking_retention_conflict( ThinkingRetention = Literal["tool_cycle", "all"] """User-facing historical thinking/analysis retention override.""" +MultimodalOutput = Literal["raw", "processed"] +"""Renderer multimodal sidecar format.""" + ResolvedThinkingRetention = Literal["template", "tool_cycle", "all"] """Internal bridge policy after template kwargs have been resolved.""" @@ -87,6 +90,12 @@ class BaseRendererConfig(BaseConfig): to the Python chat-template implementation and its explicit template kwargs.""" + multimodal_output: MultimodalOutput = "raw" + """Multimodal sidecar format: + + - ``"raw"`` — emit JSON-safe image refs/descriptors for inference paths. + - ``"processed"`` — emit image-processor payloads for SFT/training paths.""" + # Fields that are renderer-internal — not forwarded to (or mirrored # by) ``apply_chat_template``. Override in subclasses that hold # non-template config (e.g. GptOss's @@ -116,10 +125,9 @@ def template_field_names(cls) -> frozenset[str]: class AutoRendererConfig(BaseRendererConfig): """Resolve the renderer from ``tokenizer.name_or_path`` at construction - time via ``MODEL_RENDERER_MAP``. Carries only the shared - ``thinking_retention`` field when explicitly set; template kwargs require - an explicit renderer choice so template-dependent behaviour stays visible - at the call site.""" + time via ``MODEL_RENDERER_MAP``. Carries the shared base fields into the + concrete renderer config; template kwargs require an explicit renderer + choice so template-dependent behaviour stays visible at the call site.""" name: Literal["auto"] = "auto" @@ -640,6 +648,7 @@ def config_from_name(name: str) -> BaseRendererConfig | None: "LagunaXS2RendererConfig", "Llama3RendererConfig", "MiniMaxM2RendererConfig", + "MultimodalOutput", "Nemotron3RendererConfig", "Nemotron3UltraRendererConfig", "Qwen35RendererConfig", diff --git a/renderers/kimi_k25.py b/renderers/kimi_k25.py index ebd5f04..4fb576c 100644 --- a/renderers/kimi_k25.py +++ b/renderers/kimi_k25.py @@ -53,6 +53,8 @@ _image_source, _is_image_part, _is_video_part, + _load_pil_image, + _pil_image_hash, _raw_image_uri, ) @@ -506,6 +508,58 @@ def kimi_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str return 1, desc.mm_hash, item +def load_kimi_processor(tokenizer): + try: + from transformers import AutoProcessor + except ImportError as exc: + raise RuntimeError( + "Processed multimodal rendering requires transformers with " + "AutoProcessor support." + ) from exc + + name = getattr(tokenizer, "name_or_path", None) + if not name: + raise RuntimeError( + "KimiK25Renderer needs a processor for multimodal_output='processed'. " + "Inject `renderer._processor` or load the tokenizer with a known " + "name_or_path." + ) + + from renderers.base import TRUSTED_REVISIONS + + kwargs: dict[str, Any] = {"trust_remote_code": True} + revision = TRUSTED_REVISIONS.get(name) + if revision is not None: + kwargs["revision"] = revision + return AutoProcessor.from_pretrained(name, **kwargs) + + +def kimi_processed_image_item_for_render( + part: dict[str, Any], + *, + processor: Any, + image_cache: dict[str, tuple[Any, int]], +) -> tuple[int, str, dict[str, Any]]: + pil = _load_pil_image(part) + image_hash = _pil_image_hash(pil) + cached = image_cache.get(image_hash) + if cached is not None: + out, _num_patches = cached + else: + img_proc = processor.image_processor + media_item = {"type": "image", "image": pil} + out = img_proc.preprocess([media_item], return_tensors="np") + num_patches = int(img_proc.media_tokens_calculator(media_item)) + if len(image_cache) >= 256: + image_cache.pop(next(iter(image_cache))) + image_cache[image_hash] = (out, num_patches) + item = { + "pixel_values": out["pixel_values"], + "grid_thws": out["grid_thws"], + } + return 1, image_hash, item + + # --------------------------------------------------------------------------- # Kimi K2.5 response parsing (mirrors K2 format, same token structure) # --------------------------------------------------------------------------- @@ -700,6 +754,8 @@ def __init__( config: KimiK25RendererConfig | None = None, ): self._tokenizer = tokenizer + self._processor: Any = None + self._image_cache: dict[str, tuple[Any, int]] = {} self.config = config or KimiK25RendererConfig() self.effective_thinking_retention = resolve_thinking_retention( self.config, @@ -769,6 +825,22 @@ def _encode(self, text: str) -> list[int]: return [] return self._tokenizer.encode(text, add_special_tokens=False) + def _get_processor(self): + if self._processor is None: + self._processor = load_kimi_processor(self._tokenizer) + return self._processor + + def _image_item_for_render( + self, part: dict[str, Any] + ) -> tuple[int, str, dict[str, Any]]: + if self.config.multimodal_output == "processed": + return kimi_processed_image_item_for_render( + part, + processor=self._get_processor(), + image_cache=self._image_cache, + ) + return kimi_image_item_for_render(part) + # ------------------------------------------------------------------ # Core render # ------------------------------------------------------------------ @@ -866,7 +938,7 @@ def emit_image( ``<|media_content|>``, ``<|media_end|>``, the trailing ``\\n``) are template-injected scaffold. """ - _placeholder_len, h, mm_item = kimi_image_item_for_render(part) + _placeholder_len, h, mm_item = self._image_item_for_render(part) emit_special( self._media_begin, msg_idx, is_sampled=is_sampled, is_content=False ) @@ -1151,7 +1223,7 @@ def emit_image( is_sampled: bool = False, is_content: bool = False, ) -> None: - _placeholder_len, h, mm_item = kimi_image_item_for_render(part) + _placeholder_len, h, mm_item = self._image_item_for_render(part) emit_special(self._media_begin, msg_idx) emit_text("image", msg_idx) emit_special(self._media_content, msg_idx) diff --git a/renderers/mm_store.py b/renderers/mm_store.py index ce07ab9..ddeb9bf 100644 --- a/renderers/mm_store.py +++ b/renderers/mm_store.py @@ -1,9 +1,9 @@ """Run-scoped image asset helpers for multimodal rendering. -The renderer stack does not ship processed multimodal features. Images are -written once into the run output tree and messages carry ``file://`` URLs to -those files. Renderers then emit lightweight image refs for vLLM only when the -engine needs to process an image. +The default renderer multimodal mode does not ship processed image features. +Images are written once into the run output tree and messages carry ``file://`` +URLs to those files. Renderers then emit lightweight image refs for vLLM only +when the engine needs to process an image. """ from __future__ import annotations diff --git a/renderers/qwen35.py b/renderers/qwen35.py index 49008dc..ccce03f 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -8,8 +8,9 @@ ``ImagePart``, the renderer emits the same ``<|vision_start|>``+N×``<|image_pad|>`` +``<|vision_end|>`` expansion as the HF chat template (``N = image_grid_thw.prod() // merge_size**2``) using the renderer's baked image -layout spec. It does not call the HF image processor; vLLM receives run image refs -for images it must process. +layout spec. By default, vLLM receives run image refs for images it must +process; ``multimodal_output="processed"`` emits image-processor payloads for +SFT/training callers. """ from __future__ import annotations @@ -38,7 +39,9 @@ from renderers.qwen3_vl import ( _is_image_part, _is_video_part, + load_qwen_processor, qwen_image_item_for_render, + qwen_processed_image_item_for_render, ) # --------------------------------------------------------------------------- @@ -118,6 +121,8 @@ def __init__( config: Qwen35RendererConfig | None = None, ): self._tokenizer = tokenizer + self._processor: Any = None + self._image_cache: dict[str, tuple[Any, int]] = {} cfg = config or type(self)._config_cls() # ``enable_thinking=None`` defers to the model's known default (see # ``_ENABLE_THINKING_DEFAULTS``). Materialise here so downstream reads @@ -184,6 +189,22 @@ def _encode(self, text: str) -> list[int]: return [] return self._tokenizer.encode(text, add_special_tokens=False) + def _get_processor(self): + if self._processor is None: + self._processor = load_qwen_processor(self._tokenizer, type(self).__name__) + return self._processor + + def _image_item_for_render( + self, part: dict[str, Any] + ) -> tuple[int, str, dict[str, Any]]: + if self.config.multimodal_output == "processed": + return qwen_processed_image_item_for_render( + part, + processor=self._get_processor(), + image_cache=self._image_cache, + ) + return qwen_image_item_for_render(part) + # ------------------------------------------------------------------ # Content rendering (mirrors the render_content Jinja macro) # ------------------------------------------------------------------ @@ -329,7 +350,7 @@ def emit_image(part: dict[str, Any], msg_idx: int) -> None: # image data, so they ARE body content (is_content=True); # the surrounding ``<|vision_start|>`` / ``<|vision_end|>`` # specials are template scaffold. - n, h, mm_item = qwen_image_item_for_render(part) + n, h, mm_item = self._image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: emit_text( @@ -675,7 +696,7 @@ def emit_text_segments( content_mask.append(is_content) def emit_image(part: dict[str, Any], msg_idx: int = -1) -> None: - n, h, mm_item = qwen_image_item_for_render(part) + n, h, mm_item = self._image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: emit_text(f"Picture {vision_counts['image']}: ", msg_idx) diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index e780ed7..2e424ff 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -6,11 +6,10 @@ for image inputs as the HF processor (``N = image_grid_thw.prod() // merge_size**2``). -Image data is shipped to the inference engine via run image refs, not -processed image-processor payloads. ``RenderedTokens.multi_modal_data`` -records placeholder spans, stable image hashes, and Qwen layout metadata -(``image_grid_thw``) so vLLM can cache-match prior images and process new -image refs itself. +By default, image data is shipped to the inference engine via run image refs, +not processed image-processor payloads. ``multimodal_output="processed"`` +instead emits processor payloads for SFT/training callers that need +``pixel_values`` directly. BPE boundary discipline: text runs that the chat template emits contiguously (e.g. ``"user\\n" + content_text``) must be encoded as a @@ -21,12 +20,14 @@ also can't merge across. Video-shaped content parts are detected and rejected explicitly; video -materialization is not implemented in this raw-image path yet. +materialization is not implemented yet. """ from __future__ import annotations +import base64 import hashlib +import io import json import math from dataclasses import dataclass @@ -111,6 +112,7 @@ class QwenVLImageLayoutSpec: QWEN_VL_IMAGE_LAYOUT = QwenVLImageLayoutSpec() +_PROCESSED_IMAGE_CACHE_MAX = 256 @dataclass(frozen=True) @@ -180,6 +182,47 @@ def _offloaded_image_path(source: Any) -> Path: return path +def _load_pil_image(item: dict[str, Any]): + """Resolve an ImagePart to a PIL Image for processed multimodal output.""" + try: + from PIL import Image + except ImportError as exc: + raise RuntimeError( + "Processed multimodal rendering requires Pillow. Install " + "`renderers[vision]` or provide Pillow in the caller environment." + ) from exc + + raw = _image_source(item) + if isinstance(raw, Image.Image): + return raw.convert("RGB") if raw.mode != "RGB" else raw + + if isinstance(raw, (bytes, bytearray)): + return Image.open(io.BytesIO(raw)).convert("RGB") + + if not isinstance(raw, str): + raise TypeError( + f"Unsupported image source {type(raw).__name__!r}; expected PIL " + "Image, bytes, path, http(s):// URL, file:// URL, or data: URI." + ) + + if raw.startswith("data:"): + _, _, payload = raw.partition(",") + return Image.open(io.BytesIO(base64.b64decode(payload))).convert("RGB") + + parsed = urlparse(raw) + if parsed.scheme in ("http", "https"): + import urllib.request + + with urllib.request.urlopen(raw) as resp: # noqa: S310 + return Image.open(io.BytesIO(resp.read())).convert("RGB") + + if parsed.scheme in ("file", ""): + path = unquote(parsed.path) if parsed.scheme == "file" else raw + return Image.open(path).convert("RGB") + + raise ValueError(f"Unsupported image URL scheme: {parsed.scheme!r} in {raw!r}") + + def _image_dimensions(source: Any) -> tuple[int, int]: try: from PIL import Image @@ -196,6 +239,13 @@ def _image_content_hash(source: Any) -> str: return hashlib.sha256(_offloaded_image_path(source).read_bytes()).hexdigest()[:32] +def _pil_image_hash(pil_image) -> str: + h = hashlib.sha256() + h.update(pil_image.tobytes()) + h.update(f"{pil_image.size}".encode()) + return h.hexdigest()[:32] + + def _raw_image_uri(source: Any) -> str: return _offloaded_image_path(source).as_uri() @@ -247,6 +297,51 @@ def qwen_image_item_for_render(part: dict[str, Any]) -> tuple[int, str, dict[str return desc.num_image_tokens, desc.mm_hash, item +def load_qwen_processor(tokenizer, renderer_name: str): + try: + from transformers import AutoProcessor + except ImportError as exc: + raise RuntimeError( + "Processed multimodal rendering requires transformers with " + "AutoProcessor support." + ) from exc + + name = getattr(tokenizer, "name_or_path", None) + if not name: + raise RuntimeError( + f"{renderer_name} needs a processor for multimodal_output='processed'. " + "Inject `renderer._processor` or load the tokenizer with a known " + "name_or_path." + ) + return AutoProcessor.from_pretrained(name) + + +def qwen_processed_image_item_for_render( + part: dict[str, Any], + *, + processor: Any, + image_cache: dict[str, tuple[Any, int]], +) -> tuple[int, str, dict[str, Any]]: + pil = _load_pil_image(part) + image_hash = _pil_image_hash(pil) + cached = image_cache.get(image_hash) + if cached is not None: + out, num_image_tokens = cached + else: + out = processor.image_processor(images=[pil], return_tensors="np") + grid_thw = out["image_grid_thw"][0] + merge_size = processor.image_processor.merge_size + num_image_tokens = int(grid_thw.prod()) // (merge_size * merge_size) + if len(image_cache) >= _PROCESSED_IMAGE_CACHE_MAX: + image_cache.pop(next(iter(image_cache))) + image_cache[image_hash] = (out, num_image_tokens) + item = { + "pixel_values": out["pixel_values"], + "image_grid_thw": out["image_grid_thw"], + } + return num_image_tokens, image_hash, item + + class _Emitter: """Token-stream builder with BPE-safe text buffering. @@ -392,6 +487,8 @@ def __init__( config: Qwen3VLRendererConfig | None = None, ): self._tokenizer = tokenizer + self._processor: Any = None + self._image_cache: dict[str, tuple[Any, int]] = {} self.config = config or Qwen3VLRendererConfig() self.effective_thinking_retention = resolve_thinking_retention( self.config, @@ -436,6 +533,22 @@ def _encode(self, text: str) -> list[int]: return [] return self._tokenizer.encode(text, add_special_tokens=False) + def _get_processor(self): + if self._processor is None: + self._processor = load_qwen_processor(self._tokenizer, type(self).__name__) + return self._processor + + def _image_item_for_render( + self, part: dict[str, Any] + ) -> tuple[int, str, dict[str, Any]]: + if self.config.multimodal_output == "processed": + return qwen_processed_image_item_for_render( + part, + processor=self._get_processor(), + image_cache=self._image_cache, + ) + return qwen_image_item_for_render(part) + @staticmethod def _render_text_content(content: Any) -> str: """Flatten a content list to a single text string, dropping media parts. @@ -503,7 +616,7 @@ def emit_image(part: dict[str, Any]) -> None: # image data, so they ARE body content (is_content=True); # the surrounding ``<|vision_start|>`` / ``<|vision_end|>`` # markers are renderer-emitted scaffold. - n, h, mm_item = qwen_image_item_for_render(part) + n, h, mm_item = self._image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: em.text( @@ -770,7 +883,7 @@ def bridge_to_next_turn( vision_counts = {"image": prev_image_count, "video": prev_video_count} def emit_image(part: dict[str, Any]) -> None: - n, h, mm_item = qwen_image_item_for_render(part) + n, h, mm_item = self._image_item_for_render(part) vision_counts["image"] += 1 if self.config.add_vision_id: em.text( diff --git a/tests/test_multimodal_output_modes.py b/tests/test_multimodal_output_modes.py new file mode 100644 index 0000000..ed898b4 --- /dev/null +++ b/tests/test_multimodal_output_modes.py @@ -0,0 +1,71 @@ +import numpy as np +import pytest + +from renderers.kimi_k25 import kimi_processed_image_item_for_render +from renderers.qwen3_vl import qwen_processed_image_item_for_render + + +def _tiny_image_path(tmp_path): + Image = pytest.importorskip("PIL.Image") + path = tmp_path / "tiny.png" + Image.new("RGB", (16, 16), color=(120, 80, 40)).save(path) + return path + + +def test_qwen_processed_image_item_emits_processor_payload(tmp_path): + class _ImageProcessor: + merge_size = 2 + + def __call__(self, images, return_tensors): + assert len(images) == 1 + assert return_tensors == "np" + return { + "pixel_values": np.ones((4, 3), dtype=np.float32), + "image_grid_thw": np.array([[1, 4, 4]], dtype=np.int64), + } + + class _Processor: + image_processor = _ImageProcessor() + + num_tokens, image_hash, item = qwen_processed_image_item_for_render( + {"type": "image", "image": str(_tiny_image_path(tmp_path))}, + processor=_Processor(), + image_cache={}, + ) + + assert num_tokens == 4 + assert len(image_hash) == 32 + assert set(item) == {"pixel_values", "image_grid_thw"} + assert item["pixel_values"].shape == (4, 3) + assert item["image_grid_thw"].tolist() == [[1, 4, 4]] + + +def test_kimi_processed_image_item_emits_processor_payload(tmp_path): + class _ImageProcessor: + def preprocess(self, media, return_tensors): + assert len(media) == 1 + assert media[0]["type"] == "image" + assert return_tensors == "np" + return { + "pixel_values": np.ones((2, 3), dtype=np.float32), + "grid_thws": np.array([[1, 2, 2]], dtype=np.int64), + } + + def media_tokens_calculator(self, media): + assert media["type"] == "image" + return 2 + + class _Processor: + image_processor = _ImageProcessor() + + placeholder_len, image_hash, item = kimi_processed_image_item_for_render( + {"type": "image", "image": str(_tiny_image_path(tmp_path))}, + processor=_Processor(), + image_cache={}, + ) + + assert placeholder_len == 1 + assert len(image_hash) == 32 + assert set(item) == {"pixel_values", "grid_thws"} + assert item["pixel_values"].shape == (2, 3) + assert item["grid_thws"].tolist() == [[1, 2, 2]] diff --git a/tests/test_renderer_config.py b/tests/test_renderer_config.py index a35f270..4da5d57 100644 --- a/tests/test_renderer_config.py +++ b/tests/test_renderer_config.py @@ -101,13 +101,14 @@ def __init__(self, tokenizer, config): renderer = create_renderer( SimpleNamespace(name_or_path="fake/qwen35"), - AutoRendererConfig(thinking_retention="all"), + AutoRendererConfig(thinking_retention="all", multimodal_output="processed"), ) assert isinstance(renderer.config, Qwen35RendererConfig) assert renderer.config.thinking_retention == "all" + assert renderer.config.multimodal_output == "processed" # Template-level kwargs stay at their per-renderer defaults — auto - # carries only the thinking_retention flag. + # carries only shared base fields. assert renderer.config.add_vision_id is False diff --git a/uv.lock b/uv.lock index 2c6f5e6..df85d24 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-06-19T02:36:32.208558271Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P7D" [options.exclude-newer-package] @@ -173,7 +173,7 @@ name = "cuda-bindings" version = "13.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cuda-pathfinder", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "cuda-pathfinder" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/1a/fe/7351d7e586a8b4c9f89731bfe4cf0148223e8f9903ff09571f78b3fb0682/cuda_bindings-13.2.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08b395f79cb89ce0cd8effff07c4a1e20101b873c256a1aeb286e8fd7bd0f556", size = 5744254, upload-time = "2026-03-11T00:12:29.798Z" }, @@ -204,37 +204,37 @@ wheels = [ [package.optional-dependencies] cublas = [ - { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cublas" }, ] cudart = [ - { name = "nvidia-cuda-runtime", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime" }, ] cufft = [ - { name = "nvidia-cufft", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cufft" }, ] cufile = [ - { name = "nvidia-cufile", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufile" }, ] cupti = [ - { name = "nvidia-cuda-cupti", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti" }, ] curand = [ - { name = "nvidia-curand", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-curand" }, ] cusolver = [ - { name = "nvidia-cusolver", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cusolver" }, ] cusparse = [ - { name = "nvidia-cusparse", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cusparse" }, ] nvjitlink = [ - { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink" }, ] nvrtc = [ - { name = "nvidia-cuda-nvrtc", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc" }, ] nvtx = [ - { name = "nvidia-nvtx", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, + { name = "nvidia-nvtx" }, ] [[package]] @@ -260,7 +260,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ @@ -778,7 +778,7 @@ name = "nvidia-cudnn-cu13" version = "9.19.0.56" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "nvidia-cublas" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/f1/84/26025437c1e6b61a707442184fa0c03d083b661adf3a3eecfd6d21677740/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:6ed29ffaee1176c612daf442e4dd6cfeb6a0caa43ddcbeb59da94953030b1be4", size = 433781201, upload-time = "2026-02-03T20:40:53.805Z" }, @@ -790,7 +790,7 @@ name = "nvidia-cufft" version = "12.0.0.61" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "nvidia-nvjitlink" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554, upload-time = "2025-09-04T08:31:38.196Z" }, @@ -820,9 +820,9 @@ name = "nvidia-cusolver" version = "12.0.4.66" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, - { name = "nvidia-cusparse", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, - { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "nvidia-cublas" }, + { name = "nvidia-cusparse" }, + { name = "nvidia-nvjitlink" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760, upload-time = "2025-09-04T08:33:04.222Z" }, @@ -834,7 +834,7 @@ name = "nvidia-cusparse" version = "12.6.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "nvidia-nvjitlink" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568, upload-time = "2025-09-04T08:33:42.864Z" }, @@ -1360,6 +1360,13 @@ dependencies = [ { name = "transformers" }, ] +[package.optional-dependencies] +vision = [ + { name = "pillow" }, + { name = "torch" }, + { name = "torchvision" }, +] + [package.dev-dependencies] dev = [ { name = "pillow" }, @@ -1378,10 +1385,14 @@ requires-dist = [ { name = "numpy" }, { name = "openai", specifier = ">=1.108.1" }, { name = "openai-harmony", specifier = ">=0.0.4" }, + { name = "pillow", marker = "extra == 'vision'", specifier = ">=12.2.0" }, { name = "prime-pydantic-config", specifier = ">=0.3.0.dev83" }, { name = "tiktoken" }, + { name = "torch", marker = "extra == 'vision'", specifier = ">=2.11.0" }, + { name = "torchvision", marker = "extra == 'vision'", specifier = ">=0.26.0" }, { name = "transformers", specifier = ">=4.50.0" }, ] +provides-extras = ["vision"] [package.metadata.requires-dev] dev = [ From a7953b99a96488777c6f9b0c25783e47b4ec3afb Mon Sep 17 00:00:00 2001 From: eligotts <78387377+eligotts@users.noreply.github.com> Date: Tue, 30 Jun 2026 18:20:38 +0000 Subject: [PATCH 16/16] Trim uv lock churn --- uv.lock | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/uv.lock b/uv.lock index df85d24..05058e1 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. +exclude-newer = "2026-06-19T02:36:32.208558271Z" exclude-newer-span = "P7D" [options.exclude-newer-package] @@ -173,7 +173,7 @@ name = "cuda-bindings" version = "13.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cuda-pathfinder" }, + { name = "cuda-pathfinder", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/1a/fe/7351d7e586a8b4c9f89731bfe4cf0148223e8f9903ff09571f78b3fb0682/cuda_bindings-13.2.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08b395f79cb89ce0cd8effff07c4a1e20101b873c256a1aeb286e8fd7bd0f556", size = 5744254, upload-time = "2026-03-11T00:12:29.798Z" }, @@ -204,37 +204,37 @@ wheels = [ [package.optional-dependencies] cublas = [ - { name = "nvidia-cublas" }, + { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] cudart = [ - { name = "nvidia-cuda-runtime" }, + { name = "nvidia-cuda-runtime", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] cufft = [ - { name = "nvidia-cufft" }, + { name = "nvidia-cufft", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] cufile = [ - { name = "nvidia-cufile" }, + { name = "nvidia-cufile", marker = "sys_platform == 'linux'" }, ] cupti = [ - { name = "nvidia-cuda-cupti" }, + { name = "nvidia-cuda-cupti", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] curand = [ - { name = "nvidia-curand" }, + { name = "nvidia-curand", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] cusolver = [ - { name = "nvidia-cusolver" }, + { name = "nvidia-cusolver", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] cusparse = [ - { name = "nvidia-cusparse" }, + { name = "nvidia-cusparse", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] nvjitlink = [ - { name = "nvidia-nvjitlink" }, + { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] nvrtc = [ - { name = "nvidia-cuda-nvrtc" }, + { name = "nvidia-cuda-nvrtc", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] nvtx = [ - { name = "nvidia-nvtx" }, + { name = "nvidia-nvtx", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" }, ] [[package]] @@ -260,7 +260,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ @@ -778,7 +778,7 @@ name = "nvidia-cudnn-cu13" version = "9.19.0.56" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas" }, + { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/f1/84/26025437c1e6b61a707442184fa0c03d083b661adf3a3eecfd6d21677740/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:6ed29ffaee1176c612daf442e4dd6cfeb6a0caa43ddcbeb59da94953030b1be4", size = 433781201, upload-time = "2026-02-03T20:40:53.805Z" }, @@ -790,7 +790,7 @@ name = "nvidia-cufft" version = "12.0.0.61" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink" }, + { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554, upload-time = "2025-09-04T08:31:38.196Z" }, @@ -820,9 +820,9 @@ name = "nvidia-cusolver" version = "12.0.4.66" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas" }, - { name = "nvidia-cusparse" }, - { name = "nvidia-nvjitlink" }, + { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "nvidia-cusparse", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760, upload-time = "2025-09-04T08:33:04.222Z" }, @@ -834,7 +834,7 @@ name = "nvidia-cusparse" version = "12.6.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink" }, + { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568, upload-time = "2025-09-04T08:33:42.864Z" },