From 230384a588d6d456bc478432748eb6a3d5eeabc5 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 09:42:09 -0700 Subject: [PATCH 01/19] feat(types): add RendererTransport literal + ClientConfig.renderer_transport --- verifiers/types.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/verifiers/types.py b/verifiers/types.py index 4242f8a86f..8bbc6bd573 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -78,6 +78,23 @@ EndpointClient: TypeAlias = AsyncOpenAI | OpenAI | AsyncAnthropic | Anthropic MessageType = Literal["chat", "completion"] # deprecated +# Wire-shape selector shared between RendererClient and +# OpenAIChatCompletionsTokenClient. Picks which inference-server surface the +# client targets at request-build time. Same flag drives both clients so a +# single `ClientConfig.renderer_transport` setting routes consistently. +# +# - "prime_vllm_generate" (default): vLLM's TITO surface. For RendererClient +# that's POST /v1/chat/completions with a renderer-flavored request body. +# For OpenAIChatCompletionsTokenClient that's POST +# /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge +# tokenization via the server's /tokenize route. +# - "dynamo_chat_nvext": Dynamo's standard chat-completions route with +# pre-tokenized prompt carried in `nvext.token_data`. Server-side token +# IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119 +# canonical channel). Bridge tokenization runs locally via the +# transformers fast tokenizer; no /tokenize HTTP round-trip. +RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] + # Provider-agnostic message + response types class CustomBaseModel(BaseModel): @@ -1269,6 +1286,8 @@ class ClientConfig(BaseModel): Drives the renderer pool when ``client_type == "renderer"``. 
Defaults to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" + renderer: str = "auto" + renderer_transport: RendererTransport = "prime_vllm_generate" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer pool. Defaults to the model used in API requests.""" From 131109619bf39a26accfb33b3c6964af0896aad0 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 09:42:18 -0700 Subject: [PATCH 02/19] feat(clients): graft nvext.engine_data onto OpenAI response in parse_tokens Dynamo's vLLM and SGLang backends emit engine-side token IDs and per-token logprobs under `response.nvext.engine_data` when the client opts in via `nvext.extra_fields=["engine_data"]` (PR #8119). The vLLM-native path uses non-standard top-level fields (`choices[0].token_ids`, `response.prompt_token_ids`). Add a small graft inside `from_native_response.parse_tokens` that copies the engine_data fields onto the OpenAI-shaped response when present and the top-level fields are absent. The rest of parse_tokens then reads via the standard SDK attribute path regardless of backend. --- .../clients/openai_chat_completions_client.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index d7d262f4be..87a0564510 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -469,8 +469,54 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason: case _: return None + def _graft_engine_data(response: OpenAIChatResponse) -> None: + """Graft ``nvext.engine_data.*`` onto top-level response fields.
+ + Dynamo's vLLM/SGLang backends emit engine-side token IDs and + per-token logprobs under ``response.nvext.engine_data`` when the + client opts in via ``nvext.extra_fields=["engine_data"]`` (PR + #8119). Older vLLM-native paths set + ``response.choices[0].token_ids`` / ``response.prompt_token_ids`` + directly. This helper bridges the gap: if ``engine_data`` is + present and the top-level fields are missing, copy them across. + The rest of ``parse_tokens`` then reads via the standard openai + SDK attribute path regardless of backend. + """ + nvext = getattr(response, "nvext", None) + if nvext is None and hasattr(response, "model_dump"): + nvext = response.model_dump().get("nvext") + if not isinstance(nvext, dict): + return + engine_data = nvext.get("engine_data") + if not isinstance(engine_data, dict): + return + choice = response.choices[0] + if ( + getattr(choice, "token_ids", None) is None + and engine_data.get("completion_token_ids") is not None + ): + try: + choice.token_ids = list(engine_data["completion_token_ids"]) + except Exception: + object.__setattr__( + choice, "token_ids", list(engine_data["completion_token_ids"]) + ) + if ( + getattr(response, "prompt_token_ids", None) is None + and engine_data.get("prompt_token_ids") is not None + ): + try: + response.prompt_token_ids = list(engine_data["prompt_token_ids"]) + except Exception: + object.__setattr__( + response, + "prompt_token_ids", + list(engine_data["prompt_token_ids"]), + ) + def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" + _graft_engine_data(response) choice = response.choices[0] if not hasattr(choice, "token_ids"): return None From c766529f48d5ddaef9dde7f398fc5d8ee6414ad7 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 09:42:28 -0700 Subject: [PATCH 03/19] feat(tito): add dynamo_chat_nvext transport + local bridge tokenize The verifiers TITO client previously only spoke vLLM's 
TITO surface (POST /v1/chat/completions/tokens with tokens=prompt_ids; bridge tokens via /tokenize). Dynamo serves neither route, so multi-turn TITO against Dynamo silently degraded to MITO every turn-2+. This teaches OpenAIChatCompletionsTokenClient to read ClientConfig.renderer_transport and route accordingly: * prime_vllm_generate (default): unchanged. POST /v1/chat/completions/tokens with tokens=prompt_ids; bridge tokens via /tokenize HTTP. Requires vLLM >= 0.20. * dynamo_chat_nvext: POST /v1/chat/completions with placeholder messages + nvext.token_data=prompt_ids. Bridge tokens are computed locally via the model's HF fast tokenizer (no /tokenize HTTP round-trip). Server returns engine-side token IDs and logprobs under nvext.engine_data (PR #8119 channel), parsed by the OpenAIChatCompletionsClient.from_native_response graft so the rest of the pipeline is transport-agnostic. Also fix the normalize_for_comparison asymmetry that caused get_prompt_ids to never match for vf.Message-shaped input (the form MultiTurnEnv produces after maybe_normalize_messages). Drop None-valued keys so model_dump's exhaustive view is equivalent to to_native_prompt's slimmer view. --- .../openai_chat_completions_token_client.py | 266 +++++++++++++++++- 1 file changed, 258 insertions(+), 8 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 2d8cd701cc..e5ec9a4a6e 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -18,11 +18,15 @@ OpenAITool, handle_openai_overlong_prompt, ) -from verifiers.types import SamplingArgs, State +from verifiers.types import RendererTransport, SamplingArgs, State from verifiers.utils.client_utils import ( post_chat_completion_with_routed_experts_sidecar, ) +# Sentinel for the default (legacy vLLM) transport. 
Lets callers route +# around the legacy /tokenize body shape without changing the signature. +_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" + def _has_multimodal_content(messages) -> bool: """Check if any message contains multimodal content (images, audio). @@ -51,7 +55,25 @@ class TokenizeResponse(BaseModel): class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): - """Wrapper for custom vLLM route /v1/chat/completions/tokens via AsyncOpenAI client.""" + """Token-in / token-out chat client. + + Two transports share this class, selected via + ``ClientConfig.renderer_transport``: + + * ``prime_vllm_generate`` (default): vLLM's TITO surface. + Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids`` + and uses the server's ``/tokenize`` endpoint for bridge tokens. + Requires vLLM ``>=0.20``. + + * ``dynamo_chat_nvext``: Dynamo's standard ``/v1/chat/completions`` + route with ``nvext.token_data=prompt_ids``. Server-side response + token IDs come back via ``response.nvext.engine_data.*`` + (`OpenAIChatCompletionsClient.from_native_response` grafts them + onto the OpenAI-shaped response). Bridge tokens are computed + locally via the model's HuggingFace fast tokenizer — no + ``/tokenize`` HTTP round-trip — since Dynamo doesn't expose vLLM's + token routes. + """ @property def token_client(self) -> AsyncOpenAI: @@ -61,6 +83,38 @@ def token_client(self) -> AsyncOpenAI: base_url = base_url[:-3] return self.client.with_options(base_url=base_url) + @property + def renderer_transport(self) -> RendererTransport: + """Wire-shape selector. ``ClientConfig.renderer_transport`` if set, + else the default vLLM TITO surface. 
Mirrors the same field used by + ``RendererClient`` so backend selection stays in one place.""" + return cast( + RendererTransport, + getattr(self._config, "renderer_transport", _DEFAULT_TRANSPORT) + if self._config is not None + else _DEFAULT_TRANSPORT, + ) + + def _get_local_tokenizer(self, model: str): + """Lazy, per-model HF fast tokenizer for the ``dynamo_chat_nvext`` + transport. Bridge tokens are stitched locally — no ``/tokenize`` + round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` + cost once. + """ + cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {}) + if model in cache: + return cache[model] + try: + from transformers import AutoTokenizer # type: ignore[import-not-found] + except ImportError as exc: # pragma: no cover - dependency surface + raise ImportError( + "OpenAIChatCompletionsTokenClient with " + "renderer_transport='dynamo_chat_nvext' requires " + "`transformers`. Install with `pip install transformers`." + ) from exc + cache[model] = AutoTokenizer.from_pretrained(model) + return cache[model] + @handle_openai_overlong_prompt async def get_native_response( self, @@ -75,12 +129,49 @@ def normalize_sampling_args(sampling_args: SamplingArgs): if "max_tokens" in sampling_args: sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens") sampling_args["logprobs"] = True - extra_body = dict(return_token_ids=True) - if "extra_body" in sampling_args: - sampling_args["extra_body"] = { - **sampling_args["extra_body"], - **extra_body, + + # Transport-specific opt-ins. Both transports get response-side + # token IDs, just via different fields: + # + # * prime_vllm_generate (vLLM): `extra_body.return_token_ids=True` + # tells vLLM to set the non-standard `choices[0].token_ids` and + # `response.prompt_token_ids` fields. `parse_tokens` reads them + # directly. 
+ # + # * dynamo_chat_nvext: `nvext.extra_fields=["engine_data"]` + # tells Dynamo's response builder to emit `response.nvext` + # `engine_data.{completion_token_ids, completion_logprobs, + # prompt_token_ids}` (PR #8119 channel mirrored to vLLM in + # ai-dynamo/dynamo `rl-sdk-2`). `from_native_response` grafts + # this onto the OpenAI-shaped response so `parse_tokens` + # works unmodified. `return_token_ids` is dropped because + # Dynamo's strict validator rejects it. + if self.renderer_transport == "dynamo_chat_nvext": + extra_body: dict[str, Any] = { + "nvext": {"extra_fields": ["engine_data"]} } + else: + extra_body = {"return_token_ids": True} + + if "extra_body" in sampling_args: + merged = {**sampling_args["extra_body"]} + # Merge nvext.extra_fields cumulatively rather than overwriting, + # so caller-provided extra_fields (e.g. "timing", "worker_id") + # coexist with our "engine_data" opt-in. + if "nvext" in merged and "nvext" in extra_body: + base = dict(merged.get("nvext") or {}) + inc = dict(extra_body.get("nvext") or {}) + base_ef = list(base.get("extra_fields") or []) + inc_ef = list(inc.get("extra_fields") or []) + merged_ef = list(dict.fromkeys(base_ef + inc_ef)) + merged_nvext = {**base, **inc, "extra_fields": merged_ef} + merged["nvext"] = merged_nvext + sampling_args["extra_body"] = { + **{k: v for k, v in extra_body.items() if k != "nvext"}, + **merged, + } + else: + sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body return {k: v for k, v in sampling_args.items() if v is not None} @@ -126,6 +217,16 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) + if self.renderer_transport == "dynamo_chat_nvext": + return await self._post_dynamo_chat_nvext( + prompt=prompt, + prompt_ids=prompt_ids, + model=model, + tools=tools, + sampling_args=sampling_args, + extra_headers=extra_headers, + ) + extra_body = 
sampling_args.pop("extra_body", {}) body = { "model": model, @@ -143,6 +244,86 @@ def normalize_sampling_args(sampling_args: SamplingArgs): extra_headers=extra_headers, ) + async def _post_dynamo_chat_nvext( + self, + prompt: OpenAIChatMessages, + prompt_ids: list[int], + model: str, + tools: list[OpenAITool] | None, + sampling_args: dict, + extra_headers: Mapping[str, str] | None, + ) -> OpenAIChatResponse: + """Post stitched ``prompt_ids`` to Dynamo's chat-completions route. + + The engine sees ``nvext.token_data`` and skips its own tokenization, + so the placeholder ``messages`` value stays small regardless of + trajectory length. Response token IDs come back via + ``response.nvext.engine_data.completion_token_ids`` and are grafted + onto ``choices[0].token_ids`` by + ``OpenAIChatCompletionsClient.from_native_response`` so the rest of + the pipeline reads them via the standard openai SDK attribute path. + """ + extra_body = dict(sampling_args.pop("extra_body", {}) or {}) + + # nvext.token_data is the canonical pre-tokenized-prompt channel. + # Merge with caller-provided nvext (extra_fields etc.) rather than + # overwriting it. normalize_sampling_args already injected + # extra_fields=["engine_data"] into extra_body.nvext, so this just + # adds token_data to that same dict. + caller_nvext = dict(extra_body.pop("nvext", None) or {}) + caller_nvext["token_data"] = prompt_ids + nvext = caller_nvext + + body: dict[str, Any] = { + "model": model, + "messages": prompt, # placeholder; engine ignores when token_data present + "stream": False, + "nvext": nvext, + } + if tools: + body["tools"] = tools + + # Sampling params that Dynamo's chat-completions surface accepts + # directly. Anything else stays in extra_body and rides as an + # unrecognized passthrough field (validate.rs PASSTHROUGH_EXTRA_FIELDS). 
+ promotable = ( + "max_completion_tokens", + "max_tokens", + "temperature", + "top_p", + "top_k", + "min_p", + "seed", + "n", + "repetition_penalty", + "min_tokens", + "logprobs", + "top_logprobs", + "stop", + ) + for key in promotable: + value = sampling_args.get(key, extra_body.get(key)) + if value is not None and key not in body: + body[key] = value + + # Remaining extra_body keys (cache_salt, stop_token_ids, + # bad_words_token_ids, ...) pass through unchanged. The dynamo + # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist accepts these + # without rejection; unknown keys are silently ignored. + passthrough = { + k: v + for k, v in extra_body.items() + if k not in promotable and v is not None and k not in body + } + body.update(passthrough) + + return await self.client.post( + "/chat/completions", + body=body, + cast_to=ChatCompletion, + options={"headers": extra_headers} if extra_headers else {}, + ) + async def get_prompt_ids( self, state: State, @@ -176,6 +357,15 @@ def normalize_for_comparison(value: Any) -> Any: # prefix-match equality is unaffected. if normalized.get("content") == "": normalized["content"] = None + # Drop None-valued keys so model_dump's exhaustive view (which + # carries e.g. thinking_blocks=None on AssistantMessage) is + # equivalent to to_native_prompt's slimmer view (which omits + # the field entirely). Without this, vf.Message-shaped input + # (what MultiTurnEnv produces after maybe_normalize_messages) + # never matches the to_native_prompt-normalized step messages, + # which breaks the prefix match and forces TITO to fall back + # to MITO every turn-2+. + normalized = {k: v for k, v in normalized.items() if v is not None} return normalized if isinstance(value, list): return [normalize_for_comparison(item) for item in value] @@ -369,9 +559,28 @@ async def tokenize( extra_kwargs: dict | None = None, **kwargs, ) -> list[int]: - """Tokenize messages using the vLLM /tokenize API.""" + """Tokenize messages for bridge-token computation. 
+ + Dispatched by ``renderer_transport``: + + * ``prime_vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. + * ``dynamo_chat_nvext``: local HF fast-tokenizer call. Dynamo doesn't + expose ``/tokenize``; running locally also saves two HTTP RTTs per + turn (the bridge computes both ``add_generation_prompt=True`` and + ``False`` views). The HF Rust encode releases the GIL so the + ``asyncio.to_thread`` wrap gives the event loop real parallelism. + """ if extra_kwargs is None: extra_kwargs = {} + + if self.renderer_transport == "dynamo_chat_nvext": + return await self._local_tokenize( + messages=messages, + tools=tools, + model=model, + extra_kwargs=extra_kwargs, + ) + if isinstance(messages, str): body = dict( model=model, @@ -392,3 +601,44 @@ async def tokenize( "/tokenize", body=body, cast_to=TokenizeResponse ) return tokenize_response.tokens + + async def _local_tokenize( + self, + messages: str | OpenAIChatMessages, + tools: list[OpenAITool] | None, + model: str, + extra_kwargs: dict, + ) -> list[int]: + """Local in-process tokenization for the ``dynamo_chat_nvext`` transport. + + Bridge tokenization under TITO calls this twice per turn (once for + ``add_generation_prompt=True`` and once for ``False``). Both runs + execute in a worker thread so the event loop stays free; HF fast + tokenizers release the GIL during the Rust encode pass. 
+ """ + import asyncio + + tokenizer = self._get_local_tokenizer(model) + add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) + chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) + + if isinstance(messages, str): + def _encode_text() -> list[int]: + return list(tokenizer.encode(messages, add_special_tokens=False)) + return await asyncio.to_thread(_encode_text) + + def _encode_chat() -> list[int]: + ids = tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=add_generation_prompt, + tokenize=True, + **chat_template_kwargs, + ) + if hasattr(ids, "input_ids"): + ids = ids.input_ids + if ids and isinstance(ids[0], list): + ids = ids[0] + return [int(t) for t in ids] + + return await asyncio.to_thread(_encode_chat) From f12bf6346d8dc2ee3669d5e18265880b2d6bda00 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 13 May 2026 20:43:58 -0700 Subject: [PATCH 04/19] feat(clients): graft top-level nvext.completion_token_ids + prompt_token_ids (plan B3) --- .../clients/openai_chat_completions_client.py | 70 ++++++++++++------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 87a0564510..c4e60f3926 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -470,49 +470,67 @@ def parse_finish_reason(response: OpenAIChatResponse) -> FinishReason: return None def _graft_engine_data(response: OpenAIChatResponse) -> None: - """Graft ``nvext.engine_data.*`` onto top-level response fields. - - Dynamo's vLLM/SGLang backends emit engine-side token IDs and - per-token logprobs under ``response.nvext.engine_data`` when the - client opts in via ``nvext.extra_fields=["engine_data"]`` (PR - #8119). Older vLLM-native paths set - ``response.choices[0].token_ids`` / ``response.prompt_token_ids`` - directly. 
This helper bridges the gap: if ``engine_data`` is - present and the top-level fields are missing, copy them across. - The rest of ``parse_tokens`` then reads via the standard openai - SDK attribute path regardless of backend. + """Graft engine-side token IDs onto top-level response fields. + + Three coexisting wire shapes from dynamo's vLLM/SGLang backends: + + 1. ``response.nvext.engine_data.{completion_token_ids, + completion_logprobs, prompt_token_ids}`` — PR #8119 channel + (opt-in: ``nvext.extra_fields=["engine_data"]``). + 2. ``response.nvext.completion_token_ids`` — top-level shape + from rl-sdk-2 plan A4 (opt-in: + ``nvext.extra_fields=["completion_token_ids"]``). No + logprobs in this shape; logprobs ride the standard + ``choices[0].logprobs.content[*].logprob`` channel. + 3. Older vLLM-native paths set ``response.choices[0].token_ids`` + / ``response.prompt_token_ids`` directly (no grafting needed). + + This helper bridges (1) and (2) onto the top-level fields the + rest of ``parse_tokens`` reads via the standard openai SDK + attribute path. ``engine_data`` wins when both are present (it + carries more — including logprobs + prompt_token_ids). """ nvext = getattr(response, "nvext", None) if nvext is None and hasattr(response, "model_dump"): nvext = response.model_dump().get("nvext") if not isinstance(nvext, dict): return - engine_data = nvext.get("engine_data") - if not isinstance(engine_data, dict): - return choice = response.choices[0] + + engine_data = nvext.get("engine_data") + completion_token_ids_top = nvext.get("completion_token_ids") + prompt_token_ids_top = nvext.get("prompt_token_ids") + + # Prefer engine_data over top-level when both arrive: engine_data + # bundles logprobs + prompt_token_ids in one place. 
+ completion_token_ids: list[int] | None = None + prompt_token_ids: list[int] | None = None + if isinstance(engine_data, dict): + if engine_data.get("completion_token_ids") is not None: + completion_token_ids = list(engine_data["completion_token_ids"]) + if engine_data.get("prompt_token_ids") is not None: + prompt_token_ids = list(engine_data["prompt_token_ids"]) + if completion_token_ids is None and completion_token_ids_top is not None: + completion_token_ids = list(completion_token_ids_top) + if prompt_token_ids is None and prompt_token_ids_top is not None: + prompt_token_ids = list(prompt_token_ids_top) + if ( getattr(choice, "token_ids", None) is None - and engine_data.get("completion_token_ids") is not None + and completion_token_ids is not None ): try: - choice.token_ids = list(engine_data["completion_token_ids"]) + choice.token_ids = completion_token_ids except Exception: - object.__setattr__( - choice, "token_ids", list(engine_data["completion_token_ids"]) - ) + object.__setattr__(choice, "token_ids", completion_token_ids) if ( getattr(response, "prompt_token_ids", None) is None - and engine_data.get("prompt_token_ids") is not None + and prompt_token_ids is not None ): try: - response.prompt_token_ids = list(engine_data["prompt_token_ids"]) + response.prompt_token_ids = prompt_token_ids except Exception: - object.__setattr__( - response, - "prompt_token_ids", - list(engine_data["prompt_token_ids"]), - ) + object.__setattr__(response, "prompt_token_ids", prompt_token_ids) def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" From ee3482aebfaf35e47ec73a55db9276364d63e1cd Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Thu, 14 May 2026 10:21:39 -0700 Subject: [PATCH 05/19] feat(clients): thread renderer_transport from ClientConfig to renderers.generate() --- verifiers/clients/renderer_client.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 
deletions(-) diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index 64ca4ec89d..ba97e1800c 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -603,15 +603,17 @@ async def get_native_response( multi_modal_data = None prompt_attribution = None - # ``renderers.client.generate`` discovers the engine's context-length - # cap on its own (via ``GET /v1/models``, cached) and raises - # ``renderers.OverlongPromptError`` on pre-flight overflow. Rebadge - # that into the verifiers-native ``OverlongPromptError`` so the - # ``MultiTurnEnv.prompt_too_long`` stop condition picks it up via - # the ``vf.Error`` hierarchy. The ``@handle_openai_overlong_prompt`` - # decorator still handles the fallback case (cap unknown → engine - # 4xx → vf.OverlongPromptError) for engines whose ``/v1/models`` - # doesn't expose ``max_model_len``. + # Thread renderer_transport from ClientConfig into generate() so the + # renderer client works against Dynamo's /v1/chat/completions surface + # as well as vLLM's /inference/v1/generate. setup_clients auto-picks + # "dynamo_chat_nvext" when client_config.backend == "dynamo". + # ``renderers.client.generate`` raises ``renderers.OverlongPromptError`` + # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops. 
+ transport = ( + self._config.renderer_transport + if self._config is not None + else "prime_vllm_generate" + ) try: return await generate( client=self.client, @@ -623,6 +625,7 @@ async def get_native_response( prompt_attribution=prompt_attribution, tools=tools, sampling_params=sampling_params, + transport=transport, cache_salt=args.get("cache_salt") or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), From 3b58bf98c0c8b4bec247de61ed5c0ee99860f352 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 00:31:57 -0700 Subject: [PATCH 06/19] fix(clients): address PR review R1-R5 (guard transport kwarg, import ChatCompletion, scrub return_token_ids, forward sampling args, graft engine_data logprobs) + rename to dynamo_chat --- ...st_openai_chat_completions_token_client.py | 33 ++++++++++++ .../clients/openai_chat_completions_client.py | 17 ++++++ .../openai_chat_completions_token_client.py | 52 ++++++++++++------- verifiers/clients/renderer_client.py | 11 ++-- verifiers/types.py | 8 +-- 5 files changed, 95 insertions(+), 26 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 923ff118e0..46b0016416 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -293,3 +293,36 @@ async def fake_get_prompt_ids( # noqa: ANN001 assert len(recording_client.calls) == 1 assert recording_client.calls[0]["path"] == "/chat/completions/tokens" assert recording_client.calls[0]["body"]["tokens"] == [10, 20] + + +@pytest.mark.asyncio +async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): + """dynamo_chat wire body: vLLM-only keys scrubbed (R3), standard sampling + args forwarded (R4), nvext token_data + passthrough preserved.""" + recording_client = _RecordingClient() + client = OpenAIChatCompletionsTokenClient(recording_client) + + await 
client._post_dynamo_chat( + prompt=cast(Any, [{"role": "user", "content": ""}]), + prompt_ids=[1, 2, 3], + model="test-model", + tools=None, + sampling_args={ + "temperature": 0.5, + "presence_penalty": 0.2, # standard arg outside the old allowlist + "extra_body": { + "return_token_ids": True, # vLLM-only — must be scrubbed + "nvext": {"extra_fields": ["engine_data"]}, + "cache_salt": "ckpt-1", # passthrough must survive + }, + }, + extra_headers=None, + ) + + body = recording_client.calls[0]["body"] + assert "return_token_ids" not in body # R3 + assert body["presence_penalty"] == 0.2 # R4 + assert body["temperature"] == 0.5 + assert body["nvext"]["token_data"] == [1, 2, 3] + assert body["nvext"]["extra_fields"] == ["engine_data"] + assert body["cache_salt"] == "ckpt-1" # passthrough preserved diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index c4e60f3926..b954dd4ce0 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -505,11 +505,16 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: # bundles logprobs + prompt_token_ids in one place. 
completion_token_ids: list[int] | None = None prompt_token_ids: list[int] | None = None + completion_logprobs: list[float] | None = None if isinstance(engine_data, dict): if engine_data.get("completion_token_ids") is not None: completion_token_ids = list(engine_data["completion_token_ids"]) if engine_data.get("prompt_token_ids") is not None: prompt_token_ids = list(engine_data["prompt_token_ids"]) + if engine_data.get("completion_logprobs") is not None: + completion_logprobs = [ + float(x) for x in engine_data["completion_logprobs"] + ] if completion_token_ids is None and completion_token_ids_top is not None: completion_token_ids = list(completion_token_ids_top) if prompt_token_ids is None and prompt_token_ids_top is not None: @@ -531,6 +536,18 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: response.prompt_token_ids = prompt_token_ids except Exception: object.__setattr__(response, "prompt_token_ids", prompt_token_ids) + # Dynamo returns logprobs only under engine_data, not + # choices[0].logprobs. Synthesize the standard shape so parse_tokens + # (which requires choices[0].logprobs.content) can read them. 
+ if ( + getattr(choice, "logprobs", None) is None + and completion_logprobs is not None + ): + synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]} + try: + choice.logprobs = synthesized + except Exception: + object.__setattr__(choice, "logprobs", synthesized) def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: assert len(response.choices) == 1, "Response should always have one choice" diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index e5ec9a4a6e..4ddb17dab5 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -3,6 +3,7 @@ from openai import AsyncOpenAI, BaseModel from openai.types.chat import ( + ChatCompletion, ChatCompletionAssistantMessageParam, ) from openai.types.chat.chat_completion_message_function_tool_call_param import ( @@ -25,7 +26,7 @@ # Sentinel for the default (legacy vLLM) transport. Lets callers route # around the legacy /tokenize body shape without changing the signature. -_DEFAULT_TRANSPORT: RendererTransport = "prime_vllm_generate" +_DEFAULT_TRANSPORT: RendererTransport = "vllm_generate" def _has_multimodal_content(messages) -> bool: @@ -60,12 +61,12 @@ class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): Two transports share this class, selected via ``ClientConfig.renderer_transport``: - * ``prime_vllm_generate`` (default): vLLM's TITO surface. + * ``vllm_generate`` (default): vLLM's TITO surface. Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids`` and uses the server's ``/tokenize`` endpoint for bridge tokens. Requires vLLM ``>=0.20``. - * ``dynamo_chat_nvext``: Dynamo's standard ``/v1/chat/completions`` + * ``dynamo_chat``: Dynamo's standard ``/v1/chat/completions`` route with ``nvext.token_data=prompt_ids``. 
Server-side response token IDs come back via ``response.nvext.engine_data.*`` (`OpenAIChatCompletionsClient.from_native_response` grafts them @@ -96,7 +97,7 @@ def renderer_transport(self) -> RendererTransport: ) def _get_local_tokenizer(self, model: str): - """Lazy, per-model HF fast tokenizer for the ``dynamo_chat_nvext`` + """Lazy, per-model HF fast tokenizer for the ``dynamo_chat`` transport. Bridge tokens are stitched locally — no ``/tokenize`` round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` cost once. @@ -109,7 +110,7 @@ def _get_local_tokenizer(self, model: str): except ImportError as exc: # pragma: no cover - dependency surface raise ImportError( "OpenAIChatCompletionsTokenClient with " - "renderer_transport='dynamo_chat_nvext' requires " + "renderer_transport='dynamo_chat' requires " "`transformers`. Install with `pip install transformers`." ) from exc cache[model] = AutoTokenizer.from_pretrained(model) @@ -133,12 +134,12 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # Transport-specific opt-ins. Both transports get response-side # token IDs, just via different fields: # - # * prime_vllm_generate (vLLM): `extra_body.return_token_ids=True` + # * vllm_generate (vLLM): `extra_body.return_token_ids=True` # tells vLLM to set the non-standard `choices[0].token_ids` and # `response.prompt_token_ids` fields. `parse_tokens` reads them # directly. # - # * dynamo_chat_nvext: `nvext.extra_fields=["engine_data"]` + # * dynamo_chat: `nvext.extra_fields=["engine_data"]` # tells Dynamo's response builder to emit `response.nvext` # `engine_data.{completion_token_ids, completion_logprobs, # prompt_token_ids}` (PR #8119 channel mirrored to vLLM in @@ -146,7 +147,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # this onto the OpenAI-shaped response so `parse_tokens` # works unmodified. `return_token_ids` is dropped because # Dynamo's strict validator rejects it. 
- if self.renderer_transport == "dynamo_chat_nvext": + if self.renderer_transport == "dynamo_chat": extra_body: dict[str, Any] = { "nvext": {"extra_fields": ["engine_data"]} } @@ -217,8 +218,8 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) - if self.renderer_transport == "dynamo_chat_nvext": - return await self._post_dynamo_chat_nvext( + if self.renderer_transport == "dynamo_chat": + return await self._post_dynamo_chat( prompt=prompt, prompt_ids=prompt_ids, model=model, @@ -244,7 +245,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): extra_headers=extra_headers, ) - async def _post_dynamo_chat_nvext( + async def _post_dynamo_chat( self, prompt: OpenAIChatMessages, prompt_ids: list[int], @@ -300,20 +301,33 @@ async def _post_dynamo_chat_nvext( "logprobs", "top_logprobs", "stop", + # Standard chat-completions sampling args (parity with the vLLM path, + # which spreads the full normalized sampling_args). + "presence_penalty", + "frequency_penalty", + "logit_bias", + "response_format", + "parallel_tool_calls", ) for key in promotable: value = sampling_args.get(key, extra_body.get(key)) if value is not None and key not in body: body[key] = value + # vLLM-only extra_body keys Dynamo's strict validator rejects — never + # forward these on the dynamo_chat wire (e.g. return_token_ids, which + # the vLLM path uses for TITO but Dynamo 400s on). + vllm_only = {"return_token_ids"} # Remaining extra_body keys (cache_salt, stop_token_ids, - # bad_words_token_ids, ...) pass through unchanged. The dynamo - # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist accepts these - # without rejection; unknown keys are silently ignored. + # bad_words_token_ids, ...) pass through unchanged via the dynamo + # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist. 
passthrough = { k: v for k, v in extra_body.items() - if k not in promotable and v is not None and k not in body + if k not in promotable + and k not in vllm_only + and v is not None + and k not in body } body.update(passthrough) @@ -563,8 +577,8 @@ async def tokenize( Dispatched by ``renderer_transport``: - * ``prime_vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. - * ``dynamo_chat_nvext``: local HF fast-tokenizer call. Dynamo doesn't + * ``vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. + * ``dynamo_chat``: local HF fast-tokenizer call. Dynamo doesn't expose ``/tokenize``; running locally also saves two HTTP RTTs per turn (the bridge computes both ``add_generation_prompt=True`` and ``False`` views). The HF Rust encode releases the GIL so the @@ -573,7 +587,7 @@ async def tokenize( if extra_kwargs is None: extra_kwargs = {} - if self.renderer_transport == "dynamo_chat_nvext": + if self.renderer_transport == "dynamo_chat": return await self._local_tokenize( messages=messages, tools=tools, @@ -609,7 +623,7 @@ async def _local_tokenize( model: str, extra_kwargs: dict, ) -> list[int]: - """Local in-process tokenization for the ``dynamo_chat_nvext`` transport. + """Local in-process tokenization for the ``dynamo_chat`` transport. Bridge tokenization under TITO calls this twice per turn (once for ``add_generation_prompt=True`` and once for ``False``). Both runs diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index ba97e1800c..cc0acd3556 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -606,14 +606,19 @@ async def get_native_response( # Thread renderer_transport from ClientConfig into generate() so the # renderer client works against Dynamo's /v1/chat/completions surface # as well as vLLM's /inference/v1/generate. setup_clients auto-picks - # "dynamo_chat_nvext" when client_config.backend == "dynamo". 
+ # "dynamo_chat" when client_config.backend == "dynamo". # ``renderers.client.generate`` raises ``renderers.OverlongPromptError`` # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops. transport = ( self._config.renderer_transport if self._config is not None - else "prime_vllm_generate" + else "vllm_generate" ) + # Only pass transport= when non-default: a pinned ``renderers`` may + # predate the kwarg, so the default path must use the upstream signature. + generate_kwargs: dict[str, Any] = {} + if transport != "vllm_generate": + generate_kwargs["transport"] = transport try: return await generate( client=self.client, @@ -625,11 +630,11 @@ async def get_native_response( prompt_attribution=prompt_attribution, tools=tools, sampling_params=sampling_params, - transport=transport, cache_salt=args.get("cache_salt") or sampling_params.pop("cache_salt", None), priority=args.get("priority") or sampling_params.pop("priority", None), extra_headers=extra_headers or None, + **generate_kwargs, ) except RendererOverlongPromptError as exc: raise OverlongPromptError(str(exc)) from exc diff --git a/verifiers/types.py b/verifiers/types.py index 8bbc6bd573..0d2dcb8abf 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -83,17 +83,17 @@ # client targets at request-build time. Same flag drives both clients so a # single `ClientConfig.renderer_transport` setting routes consistently. # -# - "prime_vllm_generate" (default): vLLM's TITO surface. For RendererClient +# - "vllm_generate" (default): vLLM's TITO surface. For RendererClient # that's POST /v1/chat/completions with a renderer-flavored request body. # For OpenAIChatCompletionsTokenClient that's POST # /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge # tokenization via the server's /tokenize route. -# - "dynamo_chat_nvext": Dynamo's standard chat-completions route with +# - "dynamo_chat": Dynamo's standard chat-completions route with # pre-tokenized prompt carried in `nvext.token_data`. 
Server-side token # IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119 # canonical channel). Bridge tokenization runs locally via the # transformers fast tokenizer; no /tokenize HTTP round-trip. -RendererTransport = Literal["prime_vllm_generate", "dynamo_chat_nvext"] +RendererTransport = Literal["vllm_generate", "dynamo_chat"] # Provider-agnostic message + response types @@ -1287,7 +1287,7 @@ class ClientConfig(BaseModel): to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" renderer: str = "auto" - renderer_transport: RendererTransport = "prime_vllm_generate" + renderer_transport: RendererTransport = "vllm_generate" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer pool. Defaults to the model used in API requests.""" From 7a85b8469a2599407fe726e57550a794a929e77a Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 00:41:48 -0700 Subject: [PATCH 07/19] fix(clients): graft engine_data logprobs even when choice logprobs is content-less; trim test comments --- ...st_openai_chat_completions_token_client.py | 49 +++++++++++++++++-- .../clients/openai_chat_completions_client.py | 16 ++++-- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 46b0016416..5391aee273 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -297,8 +297,8 @@ async def fake_get_prompt_ids( # noqa: ANN001 @pytest.mark.asyncio async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): - """dynamo_chat wire body: vLLM-only keys scrubbed (R3), standard sampling - args forwarded (R4), nvext token_data + passthrough preserved.""" + """dynamo_chat wire body: vLLM-only keys scrubbed, standard sampling args + forwarded, nvext token_data 
+ passthrough preserved.""" recording_client = _RecordingClient() client = OpenAIChatCompletionsTokenClient(recording_client) @@ -320,9 +320,48 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): ) body = recording_client.calls[0]["body"] - assert "return_token_ids" not in body # R3 - assert body["presence_penalty"] == 0.2 # R4 + assert "return_token_ids" not in body + assert body["presence_penalty"] == 0.2 assert body["temperature"] == 0.5 assert body["nvext"]["token_data"] == [1, 2, 3] assert body["nvext"]["extra_fields"] == ["engine_data"] - assert body["cache_salt"] == "ckpt-1" # passthrough preserved + assert body["cache_salt"] == "ckpt-1" + + +@pytest.mark.asyncio +async def test_graft_engine_data_synthesizes_logprobs_when_content_less(): + """engine_data.completion_logprobs must be grafted even when the choice + carries a content-less logprobs object (not only when absent).""" + from openai.types.chat import ChatCompletion + + client = OpenAIChatCompletionsClient(_NoopClient()) + native = ChatCompletion.model_validate( + { + "id": "x", + "object": "chat.completion", + "created": 1, + "model": "test-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "ok"}, + "finish_reason": "stop", + "logprobs": {"content": None}, # present but content-less + } + ], + "nvext": { + "engine_data": { + "completion_token_ids": [10, 11], + "prompt_token_ids": [1, 2, 3], + "completion_logprobs": [-0.1, -0.2], + } + }, + } + ) + + vf_response = await client.from_native_response(native) + tokens = vf_response.message.tokens + assert tokens is not None # would be None before the fix (TITO lost) + assert tokens.completion_ids == [10, 11] + assert tokens.prompt_ids == [1, 2, 3] + assert tokens.completion_logprobs == [-0.1, -0.2] diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index b954dd4ce0..0da8b410cd 100644 --- 
a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -538,11 +538,17 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: object.__setattr__(response, "prompt_token_ids", prompt_token_ids) # Dynamo returns logprobs only under engine_data, not # choices[0].logprobs. Synthesize the standard shape so parse_tokens - # (which requires choices[0].logprobs.content) can read them. - if ( - getattr(choice, "logprobs", None) is None - and completion_logprobs is not None - ): + # (which requires choices[0].logprobs.content) can read them. Graft + # whenever the choice has no usable logprobs content — i.e. logprobs + # is missing OR present-but-content-less (empty/None content) — not + # only when it is absent entirely. + existing_lp = getattr(choice, "logprobs", None) + existing_content = ( + existing_lp.get("content") + if isinstance(existing_lp, dict) + else getattr(existing_lp, "content", None) + ) + if completion_logprobs is not None and not existing_content: synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]} try: choice.logprobs = synthesized From 7cbb603ef6fead35e88d3d24c2a3638afa28c58a Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 00:45:28 -0700 Subject: [PATCH 08/19] fix(clients): dynamo_chat forwards full normalized sampling_args (drop fixed allowlist) for vLLM-path parity --- ...st_openai_chat_completions_token_client.py | 6 ++- .../openai_chat_completions_token_client.py | 54 ++++--------------- 2 files changed, 13 insertions(+), 47 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index 5391aee273..b3e5a798f4 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -309,11 +309,12 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): tools=None, sampling_args={ "temperature": 0.5, - 
"presence_penalty": 0.2, # standard arg outside the old allowlist + "presence_penalty": 0.2, + "reasoning_effort": "high", # arbitrary key: full parity, not an allowlist "extra_body": { "return_token_ids": True, # vLLM-only — must be scrubbed "nvext": {"extra_fields": ["engine_data"]}, - "cache_salt": "ckpt-1", # passthrough must survive + "cache_salt": "ckpt-1", }, }, extra_headers=None, @@ -323,6 +324,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): assert "return_token_ids" not in body assert body["presence_penalty"] == 0.2 assert body["temperature"] == 0.5 + assert body["reasoning_effort"] == "high" assert body["nvext"]["token_data"] == [1, 2, 3] assert body["nvext"]["extra_fields"] == ["engine_data"] assert body["cache_salt"] == "ckpt-1" diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 4ddb17dab5..6a3ee0164a 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -284,52 +284,16 @@ async def _post_dynamo_chat( if tools: body["tools"] = tools - # Sampling params that Dynamo's chat-completions surface accepts - # directly. Anything else stays in extra_body and rides as an - # unrecognized passthrough field (validate.rs PASSTHROUGH_EXTRA_FIELDS). - promotable = ( - "max_completion_tokens", - "max_tokens", - "temperature", - "top_p", - "top_k", - "min_p", - "seed", - "n", - "repetition_penalty", - "min_tokens", - "logprobs", - "top_logprobs", - "stop", - # Standard chat-completions sampling args (parity with the vLLM path, - # which spreads the full normalized sampling_args). 
- "presence_penalty", - "frequency_penalty", - "logit_bias", - "response_format", - "parallel_tool_calls", - ) - for key in promotable: - value = sampling_args.get(key, extra_body.get(key)) - if value is not None and key not in body: - body[key] = value - - # vLLM-only extra_body keys Dynamo's strict validator rejects — never - # forward these on the dynamo_chat wire (e.g. return_token_ids, which - # the vLLM path uses for TITO but Dynamo 400s on). + # Forward the full normalized sampling_args (parity with the vLLM path, + # which spreads all of sampling_args), then remaining extra_body keys — + # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids). + # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS. vllm_only = {"return_token_ids"} - # Remaining extra_body keys (cache_salt, stop_token_ids, - # bad_words_token_ids, ...) pass through unchanged via the dynamo - # frontend's PASSTHROUGH_EXTRA_FIELDS allowlist. - passthrough = { - k: v - for k, v in extra_body.items() - if k not in promotable - and k not in vllm_only - and v is not None - and k not in body - } - body.update(passthrough) + for source in (sampling_args, extra_body): + for key, value in source.items(): + if value is None or key in vllm_only or key in body: + continue + body[key] = value return await self.client.post( "/chat/completions", From 6b2dfbbaa4272b131b620b63dadff8dd92eaf9ed Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 01:07:28 -0700 Subject: [PATCH 09/19] fix(clients): centralize Dynamo denylist scrub (MITO+TITO), guard logprob length, tokenizer override, drop dead renderer field --- ...st_openai_chat_completions_token_client.py | 2 ++ .../clients/openai_chat_completions_client.py | 7 ++++- .../openai_chat_completions_token_client.py | 26 +++++++++++++++++-- verifiers/types.py | 1 - 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py 
b/tests/test_openai_chat_completions_token_client.py index b3e5a798f4..ad962c5ba2 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -311,6 +311,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): "temperature": 0.5, "presence_penalty": 0.2, "reasoning_effort": "high", # arbitrary key: full parity, not an allowlist + "spaces_between_special_tokens": False, # vLLM-only — must be scrubbed "extra_body": { "return_token_ids": True, # vLLM-only — must be scrubbed "nvext": {"extra_fields": ["engine_data"]}, @@ -322,6 +323,7 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): body = recording_client.calls[0]["body"] assert "return_token_ids" not in body + assert "spaces_between_special_tokens" not in body assert body["presence_penalty"] == 0.2 assert body["temperature"] == 0.5 assert body["reasoning_effort"] == "high" diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 0da8b410cd..f8e7e80f4e 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -548,7 +548,12 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: if isinstance(existing_lp, dict) else getattr(existing_lp, "content", None) ) - if completion_logprobs is not None and not existing_content: + if ( + completion_logprobs is not None + and completion_token_ids is not None + and len(completion_logprobs) == len(completion_token_ids) + and not existing_content + ): synthesized = {"content": [{"logprob": lp} for lp in completion_logprobs]} try: choice.logprobs = synthesized diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 6a3ee0164a..4725a74612 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ 
b/verifiers/clients/openai_chat_completions_token_client.py @@ -28,6 +28,12 @@ # around the legacy /tokenize body shape without changing the signature. _DEFAULT_TRANSPORT: RendererTransport = "vllm_generate" +# vLLM/prime-only sampling keys Dynamo's strict validator rejects — scrubbed +# from every dynamo_chat request body (both MITO and TITO paths). +_DYNAMO_DROP_KEYS = frozenset( + {"return_token_ids", "spaces_between_special_tokens", "priority"} +) + def _has_multimodal_content(messages) -> bool: """Check if any message contains multimodal content (images, audio). @@ -175,6 +181,15 @@ def normalize_sampling_args(sampling_args: SamplingArgs): sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body + if self.renderer_transport == "dynamo_chat": + # Drop vLLM/prime-only keys Dynamo rejects from both top-level + # args and extra_body, so MITO + TITO paths send a clean body. + eb = sampling_args.get("extra_body") + if isinstance(eb, dict): + for k in _DYNAMO_DROP_KEYS: + eb.pop(k, None) + for k in _DYNAMO_DROP_KEYS: + sampling_args.pop(k, None) return {k: v for k, v in sampling_args.items() if v is not None} sampling_args = normalize_sampling_args(sampling_args) @@ -288,7 +303,7 @@ async def _post_dynamo_chat( # which spreads all of sampling_args), then remaining extra_body keys — # minus vLLM-only keys Dynamo's strict validator rejects (return_token_ids). # Unknown keys ride through the dynamo frontend's PASSTHROUGH_EXTRA_FIELDS. - vllm_only = {"return_token_ids"} + vllm_only = _DYNAMO_DROP_KEYS for source in (sampling_args, extra_body): for key, value in source.items(): if value is None or key in vllm_only or key in body: @@ -596,7 +611,14 @@ async def _local_tokenize( """ import asyncio - tokenizer = self._get_local_tokenizer(model) + # Prefer the explicit tokenizer override so model aliases don't silently + # disable turn-2+ TITO (fall back to the served model name). 
+ tok_model = ( + getattr(self._config, "renderer_model_name", None) or model + if self._config is not None + else model + ) + tokenizer = self._get_local_tokenizer(tok_model) add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) diff --git a/verifiers/types.py b/verifiers/types.py index 0d2dcb8abf..ed1ffdb145 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -1286,7 +1286,6 @@ class ClientConfig(BaseModel): Drives the renderer pool when ``client_type == "renderer"``. Defaults to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" - renderer: str = "auto" renderer_transport: RendererTransport = "vllm_generate" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer From 9d260d3968c13201e8024ac628d3f8d024f4c329 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 01:31:14 -0700 Subject: [PATCH 10/19] fix(clients): enforce logprobs/ids length invariant in parse_tokens (all paths) --- verifiers/clients/openai_chat_completions_client.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index f8e7e80f4e..2084ea6f33 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -600,6 +600,11 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: logprobs_content = response.choices[0].logprobs["content"] completion_logprobs = [token["logprob"] for token in logprobs_content] + if len(completion_logprobs) != len(completion_ids): + # Engine returned mismatched logprobs/ids — drop rather than emit + # out-of-sync ResponseTokens. 
+ return None + choice_extra = choice.model_extra or {} return ResponseTokens( prompt_ids=prompt_ids, From 4aa48a4dfcd8ae6907e28e9629b4ca17e5418105 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 03:04:29 -0700 Subject: [PATCH 11/19] fix(clients): centralize tokenizer override in _get_local_tokenizer; route dynamo TITO through routed-experts sidecar helper --- .../openai_chat_completions_token_client.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 4725a74612..8de5a10ca5 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -3,7 +3,6 @@ from openai import AsyncOpenAI, BaseModel from openai.types.chat import ( - ChatCompletion, ChatCompletionAssistantMessageParam, ) from openai.types.chat.chat_completion_message_function_tool_call_param import ( @@ -108,6 +107,14 @@ def _get_local_tokenizer(self, model: str): round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` cost once. """ + # Honor the explicit tokenizer override (renderer_model_name) so model + # aliases don't break bridge stitching; fall back to the served model. + override = ( + getattr(self._config, "renderer_model_name", None) + if self._config is not None + else None + ) + model = override or model cache: dict[str, Any] = self.__dict__.setdefault("_tokenizer_cache", {}) if model in cache: return cache[model] @@ -310,11 +317,14 @@ async def _post_dynamo_chat( continue body[key] = value - return await self.client.post( + # Use the sidecar-aware post (same as the vLLM TITO + MITO paths) so any + # routed_experts blob is streamed, not JSON-parsed. dynamo_chat opts into + # extra_fields=["engine_data"] only, so routed_experts is normally absent. 
+ return await post_chat_completion_with_routed_experts_sidecar( + self.client, "/chat/completions", body=body, - cast_to=ChatCompletion, - options={"headers": extra_headers} if extra_headers else {}, + extra_headers=extra_headers, ) async def get_prompt_ids( @@ -611,14 +621,7 @@ async def _local_tokenize( """ import asyncio - # Prefer the explicit tokenizer override so model aliases don't silently - # disable turn-2+ TITO (fall back to the served model name). - tok_model = ( - getattr(self._config, "renderer_model_name", None) or model - if self._config is not None - else model - ) - tokenizer = self._get_local_tokenizer(tok_model) + tokenizer = self._get_local_tokenizer(model) add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) From d713edc7ab58b4f3f2ad79e410f09d2c4166042c Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Tue, 9 Jun 2026 03:11:41 -0700 Subject: [PATCH 12/19] fix(clients): load HF tokenizer inside worker thread (cache-miss from_pretrained must not block the event loop) --- verifiers/clients/openai_chat_completions_token_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 8de5a10ca5..427ed4ee1e 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -621,16 +621,19 @@ async def _local_tokenize( """ import asyncio - tokenizer = self._get_local_tokenizer(model) add_generation_prompt = bool(extra_kwargs.get("add_generation_prompt", True)) chat_template_kwargs = dict(extra_kwargs.get("chat_template_kwargs") or {}) + # Load the tokenizer inside the worker thread: a cache miss runs the + # synchronous AutoTokenizer.from_pretrained, which must not block the loop. 
if isinstance(messages, str): def _encode_text() -> list[int]: + tokenizer = self._get_local_tokenizer(model) return list(tokenizer.encode(messages, add_special_tokens=False)) return await asyncio.to_thread(_encode_text) def _encode_chat() -> list[int]: + tokenizer = self._get_local_tokenizer(model) ids = tokenizer.apply_chat_template( messages, tools=tools, From 193c5491c8fd5a3168dfefb9f4d6fa2868fdeb50 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 02:33:59 -0700 Subject: [PATCH 13/19] feat(types): add dtype to RoutedExpertsPayload contract --- verifiers/types.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/verifiers/types.py b/verifiers/types.py index ed1ffdb145..aa408f3a7c 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -228,6 +228,10 @@ class RoutedExpertsPayload(TypedDict): data: Any shape: list[int] start: int + # Element dtype of the decoded expert-id buffer ("uint8" / "uint16" / + # "int32"). NotRequired so payloads serialized before this field still + # validate; consumers default to "uint8" (the historical encoding). 
+ dtype: NotRequired[str] class ResponseTokens(CustomBaseModel): From c30dad26f8906f8bdbfa93806e9870c9af7bf96b Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 11:05:07 -0700 Subject: [PATCH 14/19] fix(routed_experts): tighten dtype to Literal and make sidecar stripper key-order robust --- tests/test_trajectory_processing.py | 23 +++++++++++++++++++++++ verifiers/types.py | 8 ++++---- verifiers/utils/response_utils.py | 17 +++++++++++++---- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py index 386e4fd947..4d4ee30111 100644 --- a/tests/test_trajectory_processing.py +++ b/tests/test_trajectory_processing.py @@ -483,3 +483,26 @@ def test_trajectory_step_mask_combining(): assert token_ids == [1, 2, 3, 4, 5] assert mask == [0, 0, 0, 1, 1] assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2] + + +def test_strip_routed_experts_data_key_order_robust(): + """The zero-copy stripper must find ``data`` regardless of key order + (``dtype``/``shape``/``start`` may precede it) and no-op when absent.""" + from verifiers.utils.response_utils import strip_routed_experts_data + + # data first (fast path) + raw = b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}' + stripped, blob = strip_routed_experts_data(raw) + assert blob is not None and blob.tobytes() == b"QUJD" + assert b'"data":""' in stripped + + # dtype/shape/start before data — must still strip the blob + raw2 = b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}' + stripped2, blob2 = strip_routed_experts_data(raw2) + assert blob2 is not None and blob2.tobytes() == b"WFla" + assert b'"data":""' in stripped2 + + # absent — no-op passthrough + raw3 = b'{"choices":[{"token_ids":[1,2]}]}' + stripped3, blob3 = strip_routed_experts_data(raw3) + assert blob3 is None and stripped3 == raw3 diff --git a/verifiers/types.py b/verifiers/types.py index aa408f3a7c..62226d1f47 100644 --- 
a/verifiers/types.py +++ b/verifiers/types.py @@ -228,10 +228,10 @@ class RoutedExpertsPayload(TypedDict): data: Any shape: list[int] start: int - # Element dtype of the decoded expert-id buffer ("uint8" / "uint16" / - # "int32"). NotRequired so payloads serialized before this field still - # validate; consumers default to "uint8" (the historical encoding). - dtype: NotRequired[str] + # Element dtype of the decoded expert-id buffer. NotRequired so payloads + # serialized before this field still validate; a decoder that doesn't see + # it falls back to "uint8" (the historical encoding). + dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]] class ResponseTokens(CustomBaseModel): diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py index 7bc13bc22d..336f59f17b 100644 --- a/verifiers/utils/response_utils.py +++ b/verifiers/utils/response_utils.py @@ -9,15 +9,24 @@ TrajectoryStepTokens, ) -ROUTED_EXPERTS_DATA_PREFIX = b'"routed_experts":{"data":"' +ROUTED_EXPERTS_OBJ_PREFIX = b'"routed_experts":{' +ROUTED_EXPERTS_DATA_KEY = b'"data":"' def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]: - data_start = raw.find(ROUTED_EXPERTS_DATA_PREFIX) - if data_start < 0: + # Zero-copy fast path for the large base64 routed_experts blob: find the + # "data" value inside the routed_experts object regardless of key order + # (shape/start/dtype may precede it), slice it out before JSON parsing. + # No-op fallback (consumer b64-decodes the string) if the shape isn't found. 
+ obj_start = raw.find(ROUTED_EXPERTS_OBJ_PREFIX) + if obj_start < 0: return raw, None - data_start += len(ROUTED_EXPERTS_DATA_PREFIX) + data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start) + if data_key < 0: + return raw, None + + data_start = data_key + len(ROUTED_EXPERTS_DATA_KEY) data_end = raw.index(b'"', data_start) routed_data = memoryview(raw)[data_start:data_end] stripped = raw[:data_start] + raw[data_end:] From ea53210208163cc115615d5468040fce26fc6701 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 11:18:37 -0700 Subject: [PATCH 15/19] fix(routed_experts): bound sidecar stripper to the routed_experts object; document dtype field --- docs/reference.md | 1 + tests/test_trajectory_processing.py | 6 ++++++ verifiers/utils/response_utils.py | 10 +++++++++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index a50811f4aa..26b02f258d 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict): data: Any # actually memoryview; kept opaque so Pydantic skips schema validation shape: list[int] start: int + dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]] # optional; absent → uint8 ``` ### TrajectoryStepTokens diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py index 4d4ee30111..3ebe7cdbb2 100644 --- a/tests/test_trajectory_processing.py +++ b/tests/test_trajectory_processing.py @@ -502,6 +502,12 @@ def test_strip_routed_experts_data_key_order_robust(): assert blob2 is not None and blob2.tobytes() == b"WFla" assert b'"data":""' in stripped2 + # routed_experts object lacks data; an unrelated sibling has data — must + # NOT be mistaken for routed experts (search bounded to the object). 
+ raw4 = b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}' + stripped4, blob4 = strip_routed_experts_data(raw4) + assert blob4 is None and stripped4 == raw4 + # absent — no-op passthrough raw3 = b'{"choices":[{"token_ids":[1,2]}]}' stripped3, blob3 = strip_routed_experts_data(raw3) diff --git a/verifiers/utils/response_utils.py b/verifiers/utils/response_utils.py index 336f59f17b..64539bda2a 100644 --- a/verifiers/utils/response_utils.py +++ b/verifiers/utils/response_utils.py @@ -22,7 +22,15 @@ def strip_routed_experts_data(raw: bytes) -> tuple[bytes, memoryview | None]: if obj_start < 0: return raw, None - data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start) + # Bound the search to the routed_experts object so a missing `data` here + # can't match an unrelated sibling's `data` later in the response. The + # object's values (base64 string, int shape/start, dtype) contain no `}`, + # so the first `}` after the prefix closes it. + obj_end = raw.find(b"}", obj_start) + if obj_end < 0: + return raw, None + + data_key = raw.find(ROUTED_EXPERTS_DATA_KEY, obj_start, obj_end) if data_key < 0: return raw, None From b31ff2d767f482178ecb68ba73ff44a67ec1a7eb Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Wed, 10 Jun 2026 17:33:46 -0700 Subject: [PATCH 16/19] docs(clients): drop PR-number and branch/plan references from dynamo_chat comments --- verifiers/clients/openai_chat_completions_client.py | 4 ++-- verifiers/clients/openai_chat_completions_token_client.py | 3 +-- verifiers/types.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 2084ea6f33..0246b9f669 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -475,10 +475,10 @@ def _graft_engine_data(response: OpenAIChatResponse) -> None: Three coexisting wire shapes from dynamo's vLLM/SGLang 
backends: 1. ``response.nvext.engine_data.{completion_token_ids, - completion_logprobs, prompt_token_ids}`` — PR #8119 channel + completion_logprobs, prompt_token_ids}`` (opt-in: ``nvext.extra_fields=["engine_data"]``). 2. ``response.nvext.completion_token_ids`` — top-level shape - from rl-sdk-2 plan A4 (opt-in: + (opt-in: ``nvext.extra_fields=["completion_token_ids"]``). No logprobs in this shape; logprobs ride the standard ``choices[0].logprobs.content[*].logprob`` channel. diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 427ed4ee1e..36fd9f08cb 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -155,8 +155,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # * dynamo_chat: `nvext.extra_fields=["engine_data"]` # tells Dynamo's response builder to emit `response.nvext` # `engine_data.{completion_token_ids, completion_logprobs, - # prompt_token_ids}` (PR #8119 channel mirrored to vLLM in - # ai-dynamo/dynamo `rl-sdk-2`). `from_native_response` grafts + # prompt_token_ids}`. `from_native_response` grafts # this onto the OpenAI-shaped response so `parse_tokens` # works unmodified. `return_token_ids` is dropped because # Dynamo's strict validator rejects it. diff --git a/verifiers/types.py b/verifiers/types.py index 62226d1f47..fd511603c3 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -90,8 +90,8 @@ # tokenization via the server's /tokenize route. # - "dynamo_chat": Dynamo's standard chat-completions route with # pre-tokenized prompt carried in `nvext.token_data`. Server-side token -# IDs come back via `nvext.engine_data.completion_token_ids` (PR #8119 -# canonical channel). Bridge tokenization runs locally via the +# IDs come back via `nvext.engine_data.completion_token_ids` (the +# canonical Dynamo channel). 
Bridge tokenization runs locally via the # transformers fast tokenizer; no /tokenize HTTP round-trip. RendererTransport = Literal["vllm_generate", "dynamo_chat"] From 17c819b4ae302bee04fb2fe47530fb0cc58d044e Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Fri, 12 Jun 2026 02:08:13 -0700 Subject: [PATCH 17/19] fix(dynamo): preserve token-data and routed experts sidecars --- ...st_openai_chat_completions_token_client.py | 69 +++++++++++++++++++ .../openai_chat_completions_token_client.py | 2 +- verifiers/utils/client_utils.py | 31 ++++++++- 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index ad962c5ba2..b442880f75 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -1,3 +1,5 @@ +import base64 +import json from typing import Any, cast import httpx @@ -8,6 +10,7 @@ OpenAIChatCompletionsTokenClient, ) from verifiers.types import State +from verifiers.utils.client_utils import post_chat_completion_with_routed_experts_sidecar class _NoopClient: @@ -46,6 +49,40 @@ async def post( ) +class _DynamoRoutedExpertsClient(_NoopClient): + async def post( + self, path: str, body: dict[str, Any], cast_to: type, **kwargs: Any + ) -> Any: + payload = { + "id": "x", + "object": "chat.completion", + "created": 1, + "model": "test-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "ok"}, + "finish_reason": "stop", + } + ], + "nvext": { + "engine_data": { + "completion_token_ids": [10], + "routed_experts": { + "data": base64.b64encode(b"abc").decode("ascii"), + "shape": [3, 1, 1], + "start": 0, + "dtype": "uint8", + }, + } + }, + } + return httpx.Response( + 200, + content=json.dumps(payload, separators=(",", ":")).encode("utf-8"), + ) + + class _PromptIdTestClient(OpenAIChatCompletionsTokenClient): def __init__(self, full_prompt_ids: list[int]) -> None: 
super().__init__(_NoopClient()) @@ -332,6 +369,38 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): assert body["cache_salt"] == "ckpt-1" +@pytest.mark.asyncio +async def test_post_dynamo_chat_uses_placeholder_messages(): + recording_client = _RecordingClient() + client = OpenAIChatCompletionsTokenClient(recording_client) + + await client._post_dynamo_chat( + prompt=cast(Any, [{"role": "user", "content": "real prompt"}]), + prompt_ids=[1, 2, 3], + model="test-model", + tools=None, + sampling_args={"extra_body": {"nvext": {"extra_fields": ["engine_data"]}}}, + extra_headers=None, + ) + + assert recording_client.calls[0]["body"]["messages"] == [ + {"role": "user", "content": ""} + ] + + +@pytest.mark.asyncio +async def test_sidecar_helper_reattaches_dynamo_engine_routed_experts(): + response = await post_chat_completion_with_routed_experts_sidecar( + _DynamoRoutedExpertsClient(), + "/chat/completions", + body={}, + ) + + routed = response.model_extra["nvext"]["engine_data"]["routed_experts"] + assert isinstance(routed["data"], memoryview) + assert routed["data"].tobytes() == base64.b64encode(b"abc") + + @pytest.mark.asyncio async def test_graft_engine_data_synthesizes_logprobs_when_content_less(): """engine_data.completion_logprobs must be grafted even when the choice diff --git a/verifiers/clients/openai_chat_completions_token_client.py b/verifiers/clients/openai_chat_completions_token_client.py index 36fd9f08cb..7f9098550f 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -298,7 +298,7 @@ async def _post_dynamo_chat( body: dict[str, Any] = { "model": model, - "messages": prompt, # placeholder; engine ignores when token_data present + "messages": [{"role": "user", "content": ""}], "stream": False, "nvext": nvext, } diff --git a/verifiers/utils/client_utils.py b/verifiers/utils/client_utils.py index d1c9b62e2a..c718c9e0ab 100644 --- 
a/verifiers/utils/client_utils.py +++ b/verifiers/utils/client_utils.py @@ -101,6 +101,27 @@ async def post_chat_completion_with_routed_experts_sidecar( body: dict[str, Any], extra_headers: Mapping[str, str] | None = None, ) -> ChatCompletion: + def _routed_experts_container(response: ChatCompletion) -> dict[str, Any] | None: + """Return the parsed routed_experts dict, wherever the backend put it.""" + candidates: list[Any] = [] + if response.choices: + choice_extra = response.choices[0].model_extra or {} + if isinstance(choice_extra, dict): + candidates.append(choice_extra.get("routed_experts")) + + top_extra = response.model_extra or {} + nvext = top_extra.get("nvext") if isinstance(top_extra, dict) else None + if isinstance(nvext, dict): + candidates.append(nvext.get("routed_experts")) + engine_data = nvext.get("engine_data") + if isinstance(engine_data, dict): + candidates.append(engine_data.get("routed_experts")) + + for candidate in candidates: + if isinstance(candidate, dict): + return candidate + return None + raw_response = await client.post( path, body=body, @@ -110,9 +131,13 @@ async def post_chat_completion_with_routed_experts_sidecar( stripped, routed_data = strip_routed_experts_data(raw_response.content) response = ChatCompletion.model_validate_json(stripped) if routed_data is not None: - choice_extra = response.choices[0].model_extra - assert choice_extra is not None - choice_extra["routed_experts"]["data"] = routed_data + routed_experts = _routed_experts_container(response) + if routed_experts is None: + raise RuntimeError( + "routed_experts data was stripped from the raw response, but no " + "parsed routed_experts object was found to reattach it." 
+ ) + routed_experts["data"] = routed_data return response From 59a01fab8684f19adadfc2e8c71862e1da88805d Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Fri, 12 Jun 2026 02:14:49 -0700 Subject: [PATCH 18/19] fix(dynamo): retain routed experts and document transport --- docs/reference.md | 6 ++ docs/training.md | 2 + ...st_openai_chat_completions_token_client.py | 56 +++++++++++++++++++ .../clients/openai_chat_completions_client.py | 11 +++- 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index 26b02f258d..e17c30f04c 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1078,6 +1078,8 @@ with `MyConfig.model_validate(...)` or use the typed object directly. class ClientConfig(BaseModel): client_idx: int = 0 client_type: ClientType = "openai_chat_completions" + renderer_transport: RendererTransport = "vllm_generate" + renderer_model_name: str | None = None preserve_all_thinking: bool = False preserve_thinking_between_tool_calls: bool = False api_key_var: str = "PRIME_API_KEY" @@ -1096,6 +1098,10 @@ class ClientConfig(BaseModel): `client_type` selects which `Client` implementation to instantiate (see [Client Classes](#client-classes)). Use `endpoint_configs` for multi-endpoint round-robin. In grouped scoring mode, groups are distributed round-robin across endpoint configs. +`renderer_transport` selects the token-in/token-out wire format used by `client_type == "openai_chat_completions_token"` and `client_type == "renderer"`. The default `"vllm_generate"` uses vLLM's token routes. Set `"dynamo_chat"` for Dynamo backends that accept pre-tokenized prompts in `nvext.token_data` on `/v1/chat/completions` and return token IDs in `nvext.engine_data`. + +`renderer_model_name` overrides the tokenizer/renderer model name used for local bridge tokenization and renderer construction. It is useful when the served API model name is an alias but the tokenizer should be loaded from the underlying Hugging Face model. 
+ `preserve_all_thinking` and `preserve_thinking_between_tool_calls` are forwarded to the underlying renderer when `client_type == "renderer"`. They control whether past-assistant `reasoning_content` is re-emitted on subsequent renders — `preserve_all_thinking` keeps every past-assistant turn's thinking, and `preserve_thinking_between_tool_calls` keeps thinking only inside the in-flight assistant→tool→…→assistant block after the most recent user turn (when that block contains at least one tool response). Both default to `False` (template default applies). When `api_key_var` is `"PRIME_API_KEY"` (the default), credentials are loaded with the following precedence: diff --git a/docs/training.md b/docs/training.md index 655d057c99..f9c82b99d3 100644 --- a/docs/training.md +++ b/docs/training.md @@ -214,6 +214,8 @@ The rollout client's `client_type` controls how prompt assembly and token state - **`openai_chat_completions_token`** (TITO, *token-in*): server-side templating, but returns prompt and completion token IDs alongside text so the trainer doesn't re-tokenize. Use when you trust the server's chat template to be stable across turns. - **`renderer`** *(experimental)*: client-side tokenization via a per-model renderer in the [`renderers` package](https://github.com/PrimeIntellect-ai/verifiers/tree/main/packages/renderers). Install it with `uv add "verifiers[renderers]"` before using `client_type="renderer"`. The trainer renders messages to token IDs locally and sends those to vLLM's `/v1/generate` endpoint. The renderer's `bridge_to_next_turn` extends prior-turn tokens verbatim across multi-turn boundaries (the *extension property*) and synthesizes the canonical turn-close on mid-completion truncation, so multi-turn rollouts merge into one training sample with one clean loss mask. +`openai_chat_completions_token` defaults to `renderer_transport="vllm_generate"`, which uses vLLM token routes. 
For Dynamo inference backends, set `renderer_transport="dynamo_chat"` so multi-turn rollouts send the stitched prompt in `nvext.token_data` on `/v1/chat/completions` and read server token IDs from `nvext.engine_data`. In `prime-rl`, this is normally selected for you when `client.backend = "dynamo"`. + For production RL training, use `openai_chat_completions_token` — it's the tried-and-tested path with broad model coverage. The `renderer` client is newer and offers stronger token-preservation guarantees in theory, but is experimental: hand-coded renderers exist only for a subset of models, and corner cases are still being shaken out. See [reference § Built-in Clients](reference.md#built-in-client-implementations) for the full list. ### Common Issues diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index b442880f75..d0469fc43c 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -438,3 +438,59 @@ async def test_graft_engine_data_synthesizes_logprobs_when_content_less(): assert tokens.completion_ids == [10, 11] assert tokens.prompt_ids == [1, 2, 3] assert tokens.completion_logprobs == [-0.1, -0.2] + + +@pytest.mark.asyncio +async def test_parse_tokens_reads_dynamo_engine_routed_experts(): + from openai.types.chat import ChatCompletion + + client = OpenAIChatCompletionsClient(_NoopClient()) + native = ChatCompletion.model_validate( + { + "id": "x", + "object": "chat.completion", + "created": 1, + "model": "test-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "ok"}, + "finish_reason": "stop", + "logprobs": { + "content": [ + { + "token": "ok", + "logprob": -0.1, + "bytes": [111, 107], + "top_logprobs": [], + } + ] + }, + } + ], + "nvext": { + "engine_data": { + "completion_token_ids": [10], + "prompt_token_ids": [1, 2, 3], + "completion_logprobs": [-0.1], + "routed_experts": { + "data": "QUJD", 
+ "shape": [3, 1, 1], + "start": 0, + "dtype": "uint8", + }, + } + }, + } + ) + + vf_response = await client.from_native_response(native) + tokens = vf_response.message.tokens + + assert tokens is not None + assert tokens.routed_experts == { + "data": "QUJD", + "shape": [3, 1, 1], + "start": 0, + "dtype": "uint8", + } diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py index 0246b9f669..a1f36a401b 100644 --- a/verifiers/clients/openai_chat_completions_client.py +++ b/verifiers/clients/openai_chat_completions_client.py @@ -606,13 +606,22 @@ def parse_tokens(response: OpenAIChatResponse) -> ResponseTokens | None: return None choice_extra = choice.model_extra or {} + routed_experts = choice_extra.get("routed_experts") + if routed_experts is None: + top_extra = response.model_extra or {} + nvext = top_extra.get("nvext") if isinstance(top_extra, dict) else None + if isinstance(nvext, dict): + routed_experts = nvext.get("routed_experts") + engine_data = nvext.get("engine_data") + if routed_experts is None and isinstance(engine_data, dict): + routed_experts = engine_data.get("routed_experts") return ResponseTokens( prompt_ids=prompt_ids, prompt_mask=prompt_mask, completion_ids=completion_ids, completion_mask=completion_mask, completion_logprobs=completion_logprobs, - routed_experts=choice_extra.get("routed_experts"), + routed_experts=routed_experts, ) response_id = getattr(response, "id", "") From b6588838ec9f03275c6d9f6f95b69d12632d815b Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Fri, 12 Jun 2026 02:53:07 -0700 Subject: [PATCH 19/19] chore(client): rename renderer transport values --- docs/reference.md | 4 +-- docs/training.md | 2 +- ...st_openai_chat_completions_token_client.py | 10 +++--- .../openai_chat_completions_token_client.py | 36 +++++++++---------- verifiers/clients/renderer_client.py | 6 ++-- verifiers/types.py | 8 ++--- 6 files changed, 33 insertions(+), 33 deletions(-) diff --git 
a/docs/reference.md b/docs/reference.md index e17c30f04c..e4a27f3617 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1078,7 +1078,7 @@ with `MyConfig.model_validate(...)` or use the typed object directly. class ClientConfig(BaseModel): client_idx: int = 0 client_type: ClientType = "openai_chat_completions" - renderer_transport: RendererTransport = "vllm_generate" + renderer_transport: RendererTransport = "vllm" renderer_model_name: str | None = None preserve_all_thinking: bool = False preserve_thinking_between_tool_calls: bool = False @@ -1098,7 +1098,7 @@ class ClientConfig(BaseModel): `client_type` selects which `Client` implementation to instantiate (see [Client Classes](#client-classes)). Use `endpoint_configs` for multi-endpoint round-robin. In grouped scoring mode, groups are distributed round-robin across endpoint configs. -`renderer_transport` selects the token-in/token-out wire format used by `client_type == "openai_chat_completions_token"` and `client_type == "renderer"`. The default `"vllm_generate"` uses vLLM's token routes. Set `"dynamo_chat"` for Dynamo backends that accept pre-tokenized prompts in `nvext.token_data` on `/v1/chat/completions` and return token IDs in `nvext.engine_data`. +`renderer_transport` selects the token-in/token-out wire format used by `client_type == "openai_chat_completions_token"` and `client_type == "renderer"`. The default `"vllm"` uses vLLM's token routes. Set `"dynamo"` for Dynamo backends that accept pre-tokenized prompts in `nvext.token_data` on `/v1/chat/completions` and return token IDs in `nvext.engine_data`. `renderer_model_name` overrides the tokenizer/renderer model name used for local bridge tokenization and renderer construction. It is useful when the served API model name is an alias but the tokenizer should be loaded from the underlying Hugging Face model. 
diff --git a/docs/training.md b/docs/training.md index f9c82b99d3..dcc018fff1 100644 --- a/docs/training.md +++ b/docs/training.md @@ -214,7 +214,7 @@ The rollout client's `client_type` controls how prompt assembly and token state - **`openai_chat_completions_token`** (TITO, *token-in*): server-side templating, but returns prompt and completion token IDs alongside text so the trainer doesn't re-tokenize. Use when you trust the server's chat template to be stable across turns. - **`renderer`** *(experimental)*: client-side tokenization via a per-model renderer in the [`renderers` package](https://github.com/PrimeIntellect-ai/verifiers/tree/main/packages/renderers). Install it with `uv add "verifiers[renderers]"` before using `client_type="renderer"`. The trainer renders messages to token IDs locally and sends those to vLLM's `/v1/generate` endpoint. The renderer's `bridge_to_next_turn` extends prior-turn tokens verbatim across multi-turn boundaries (the *extension property*) and synthesizes the canonical turn-close on mid-completion truncation, so multi-turn rollouts merge into one training sample with one clean loss mask. -`openai_chat_completions_token` defaults to `renderer_transport="vllm_generate"`, which uses vLLM token routes. For Dynamo inference backends, set `renderer_transport="dynamo_chat"` so multi-turn rollouts send the stitched prompt in `nvext.token_data` on `/v1/chat/completions` and read server token IDs from `nvext.engine_data`. In `prime-rl`, this is normally selected for you when `client.backend = "dynamo"`. +`openai_chat_completions_token` defaults to `renderer_transport="vllm"`, which uses vLLM token routes. For Dynamo inference backends, set `renderer_transport="dynamo"` so multi-turn rollouts send the stitched prompt in `nvext.token_data` on `/v1/chat/completions` and read server token IDs from `nvext.engine_data`. In `prime-rl`, this is normally selected for you when `client.backend = "dynamo"`. 
For production RL training, use `openai_chat_completions_token` — it's the tried-and-tested path with broad model coverage. The `renderer` client is newer and offers stronger token-preservation guarantees in theory, but is experimental: hand-coded renderers exist only for a subset of models, and corner cases are still being shaken out. See [reference § Built-in Clients](reference.md#built-in-client-implementations) for the full list. diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py index d0469fc43c..248a2b7d3f 100644 --- a/tests/test_openai_chat_completions_token_client.py +++ b/tests/test_openai_chat_completions_token_client.py @@ -333,13 +333,13 @@ async def fake_get_prompt_ids( # noqa: ANN001 @pytest.mark.asyncio -async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): - """dynamo_chat wire body: vLLM-only keys scrubbed, standard sampling args +async def test_post_dynamo_scrubs_vllm_only_and_forwards_sampling(): + """dynamo wire body: vLLM-only keys scrubbed, standard sampling args forwarded, nvext token_data + passthrough preserved.""" recording_client = _RecordingClient() client = OpenAIChatCompletionsTokenClient(recording_client) - await client._post_dynamo_chat( + await client._post_dynamo( prompt=cast(Any, [{"role": "user", "content": ""}]), prompt_ids=[1, 2, 3], model="test-model", @@ -370,11 +370,11 @@ async def test_post_dynamo_chat_scrubs_vllm_only_and_forwards_sampling(): @pytest.mark.asyncio -async def test_post_dynamo_chat_uses_placeholder_messages(): +async def test_post_dynamo_uses_placeholder_messages(): recording_client = _RecordingClient() client = OpenAIChatCompletionsTokenClient(recording_client) - await client._post_dynamo_chat( + await client._post_dynamo( prompt=cast(Any, [{"role": "user", "content": "real prompt"}]), prompt_ids=[1, 2, 3], model="test-model", diff --git a/verifiers/clients/openai_chat_completions_token_client.py 
b/verifiers/clients/openai_chat_completions_token_client.py index 7f9098550f..44f7ce7ba5 100644 --- a/verifiers/clients/openai_chat_completions_token_client.py +++ b/verifiers/clients/openai_chat_completions_token_client.py @@ -25,10 +25,10 @@ # Sentinel for the default (legacy vLLM) transport. Lets callers route # around the legacy /tokenize body shape without changing the signature. -_DEFAULT_TRANSPORT: RendererTransport = "vllm_generate" +_DEFAULT_TRANSPORT: RendererTransport = "vllm" # vLLM/prime-only sampling keys Dynamo's strict validator rejects — scrubbed -# from every dynamo_chat request body (both MITO and TITO paths). +# from every dynamo request body (both MITO and TITO paths). _DYNAMO_DROP_KEYS = frozenset( {"return_token_ids", "spaces_between_special_tokens", "priority"} ) @@ -66,12 +66,12 @@ class OpenAIChatCompletionsTokenClient(OpenAIChatCompletionsClient): Two transports share this class, selected via ``ClientConfig.renderer_transport``: - * ``vllm_generate`` (default): vLLM's TITO surface. + * ``vllm`` (default): vLLM's TITO surface. Posts to ``/v1/chat/completions/tokens`` with ``tokens=prompt_ids`` and uses the server's ``/tokenize`` endpoint for bridge tokens. Requires vLLM ``>=0.20``. - * ``dynamo_chat``: Dynamo's standard ``/v1/chat/completions`` + * ``dynamo``: Dynamo's standard ``/v1/chat/completions`` route with ``nvext.token_data=prompt_ids``. Server-side response token IDs come back via ``response.nvext.engine_data.*`` (`OpenAIChatCompletionsClient.from_native_response` grafts them @@ -102,7 +102,7 @@ def renderer_transport(self) -> RendererTransport: ) def _get_local_tokenizer(self, model: str): - """Lazy, per-model HF fast tokenizer for the ``dynamo_chat`` + """Lazy, per-model HF fast tokenizer for the ``dynamo`` transport. Bridge tokens are stitched locally — no ``/tokenize`` round-trip. Cached so we pay the ``AutoTokenizer.from_pretrained`` cost once. 
@@ -123,7 +123,7 @@ def _get_local_tokenizer(self, model: str): except ImportError as exc: # pragma: no cover - dependency surface raise ImportError( "OpenAIChatCompletionsTokenClient with " - "renderer_transport='dynamo_chat' requires " + "renderer_transport='dynamo' requires " "`transformers`. Install with `pip install transformers`." ) from exc cache[model] = AutoTokenizer.from_pretrained(model) @@ -147,19 +147,19 @@ def normalize_sampling_args(sampling_args: SamplingArgs): # Transport-specific opt-ins. Both transports get response-side # token IDs, just via different fields: # - # * vllm_generate (vLLM): `extra_body.return_token_ids=True` + # * vllm (vLLM): `extra_body.return_token_ids=True` # tells vLLM to set the non-standard `choices[0].token_ids` and # `response.prompt_token_ids` fields. `parse_tokens` reads them # directly. # - # * dynamo_chat: `nvext.extra_fields=["engine_data"]` + # * dynamo: `nvext.extra_fields=["engine_data"]` # tells Dynamo's response builder to emit `response.nvext` # `engine_data.{completion_token_ids, completion_logprobs, # prompt_token_ids}`. `from_native_response` grafts # this onto the OpenAI-shaped response so `parse_tokens` # works unmodified. `return_token_ids` is dropped because # Dynamo's strict validator rejects it. - if self.renderer_transport == "dynamo_chat": + if self.renderer_transport == "dynamo": extra_body: dict[str, Any] = { "nvext": {"extra_fields": ["engine_data"]} } @@ -187,7 +187,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): sampling_args["extra_body"] = {**merged, **extra_body} else: sampling_args["extra_body"] = extra_body - if self.renderer_transport == "dynamo_chat": + if self.renderer_transport == "dynamo": # Drop vLLM/prime-only keys Dynamo rejects from both top-level # args and extra_body, so MITO + TITO paths send a clean body. 
eb = sampling_args.get("extra_body") @@ -239,8 +239,8 @@ def normalize_sampling_args(sampling_args: SamplingArgs): prompt, model, sampling_args, tools, extra_headers=extra_headers ) - if self.renderer_transport == "dynamo_chat": - return await self._post_dynamo_chat( + if self.renderer_transport == "dynamo": + return await self._post_dynamo( prompt=prompt, prompt_ids=prompt_ids, model=model, @@ -266,7 +266,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs): extra_headers=extra_headers, ) - async def _post_dynamo_chat( + async def _post_dynamo( self, prompt: OpenAIChatMessages, prompt_ids: list[int], @@ -317,7 +317,7 @@ async def _post_dynamo_chat( body[key] = value # Use the sidecar-aware post (same as the vLLM TITO + MITO paths) so any - # routed_experts blob is streamed, not JSON-parsed. dynamo_chat opts into + # routed_experts blob is streamed, not JSON-parsed. dynamo opts into # extra_fields=["engine_data"] only, so routed_experts is normally absent. return await post_chat_completion_with_routed_experts_sidecar( self.client, @@ -565,8 +565,8 @@ async def tokenize( Dispatched by ``renderer_transport``: - * ``vllm_generate`` (default): POST to vLLM's ``/tokenize`` route. - * ``dynamo_chat``: local HF fast-tokenizer call. Dynamo doesn't + * ``vllm`` (default): POST to vLLM's ``/tokenize`` route. + * ``dynamo``: local HF fast-tokenizer call. Dynamo doesn't expose ``/tokenize``; running locally also saves two HTTP RTTs per turn (the bridge computes both ``add_generation_prompt=True`` and ``False`` views). The HF Rust encode releases the GIL so the @@ -575,7 +575,7 @@ async def tokenize( if extra_kwargs is None: extra_kwargs = {} - if self.renderer_transport == "dynamo_chat": + if self.renderer_transport == "dynamo": return await self._local_tokenize( messages=messages, tools=tools, @@ -611,7 +611,7 @@ async def _local_tokenize( model: str, extra_kwargs: dict, ) -> list[int]: - """Local in-process tokenization for the ``dynamo_chat`` transport. 
+ """Local in-process tokenization for the ``dynamo`` transport. Bridge tokenization under TITO calls this twice per turn (once for ``add_generation_prompt=True`` and once for ``False``). Both runs diff --git a/verifiers/clients/renderer_client.py b/verifiers/clients/renderer_client.py index cc0acd3556..af372bf9ff 100644 --- a/verifiers/clients/renderer_client.py +++ b/verifiers/clients/renderer_client.py @@ -606,18 +606,18 @@ async def get_native_response( # Thread renderer_transport from ClientConfig into generate() so the # renderer client works against Dynamo's /v1/chat/completions surface # as well as vLLM's /inference/v1/generate. setup_clients auto-picks - # "dynamo_chat" when client_config.backend == "dynamo". + # "dynamo" when client_config.backend == "dynamo". # ``renderers.client.generate`` raises ``renderers.OverlongPromptError`` # on pre-flight overflow; rebadge to verifiers-native so MultiTurnEnv stops. transport = ( self._config.renderer_transport if self._config is not None - else "vllm_generate" + else "vllm" ) # Only pass transport= when non-default: a pinned ``renderers`` may # predate the kwarg, so the default path must use the upstream signature. generate_kwargs: dict[str, Any] = {} - if transport != "vllm_generate": + if transport != "vllm": generate_kwargs["transport"] = transport try: return await generate( diff --git a/verifiers/types.py b/verifiers/types.py index fd511603c3..484ab3b9ed 100644 --- a/verifiers/types.py +++ b/verifiers/types.py @@ -83,17 +83,17 @@ # client targets at request-build time. Same flag drives both clients so a # single `ClientConfig.renderer_transport` setting routes consistently. # -# - "vllm_generate" (default): vLLM's TITO surface. For RendererClient +# - "vllm" (default): vLLM's TITO surface. For RendererClient # that's POST /v1/chat/completions with a renderer-flavored request body. 
# For OpenAIChatCompletionsTokenClient that's POST # /v1/chat/completions/tokens with `tokens=prompt_ids` and bridge # tokenization via the server's /tokenize route. -# - "dynamo_chat": Dynamo's standard chat-completions route with +# - "dynamo": Dynamo's standard chat-completions route with # pre-tokenized prompt carried in `nvext.token_data`. Server-side token # IDs come back via `nvext.engine_data.completion_token_ids` (the # canonical Dynamo channel). Bridge tokenization runs locally via the # transformers fast tokenizer; no /tokenize HTTP round-trip. -RendererTransport = Literal["vllm_generate", "dynamo_chat"] +RendererTransport = Literal["vllm", "dynamo"] # Provider-agnostic message + response types @@ -1290,7 +1290,7 @@ class ClientConfig(BaseModel): Drives the renderer pool when ``client_type == "renderer"``. Defaults to ``None`` so non-renderer clients aren't forced to declare it; the renderer client treats ``None`` as ``AutoRendererConfig()``.""" - renderer_transport: RendererTransport = "vllm_generate" + renderer_transport: RendererTransport = "vllm" renderer_model_name: str | None = None """Override the tokenizer model name used to instantiate the renderer pool. Defaults to the model used in API requests."""