PrimeIntellect-ai · biswapanda · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 14, 2026
diff --git a/docs/reference.md b/docs/reference.md
@@ -215,6 +215,7 @@ class RoutedExpertsPayload(TypedDict):
     data: Any  # actually memoryview; kept opaque so Pydantic skips schema validation
     shape: list[int]
     start: int
+    dtype: NotRequired[Literal["uint8", "uint16", "int16", "int32"]]  # optional; absent → uint8
 ```
 
 ### TrajectoryStepTokens
@@ -1077,6 +1078,8 @@ with `MyConfig.model_validate(...)` or use the typed object directly.
 class ClientConfig(BaseModel):
     client_idx: int = 0
     client_type: ClientType = "openai_chat_completions"
+    renderer_transport: RendererTransport = "vllm"
+    renderer_model_name: str | None = None
     preserve_all_thinking: bool = False
     preserve_thinking_between_tool_calls: bool = False
     api_key_var: str = "PRIME_API_KEY"
@@ -1095,6 +1098,10 @@ class ClientConfig(BaseModel):
 
 `client_type` selects which `Client` implementation to instantiate (see [Client Classes](#client-classes)). Use `endpoint_configs` for multi-endpoint round-robin. In grouped scoring mode, groups are distributed round-robin across endpoint configs.
 
+`renderer_transport` selects the token-in/token-out wire format used when `client_type` is `"openai_chat_completions_token"` or `"renderer"`. The default `"vllm"` uses vLLM's token routes. Set it to `"dynamo"` for Dynamo backends that accept pre-tokenized prompts in `nvext.token_data` on `/v1/chat/completions` and return token IDs in `nvext.engine_data`.
+
+`renderer_model_name` overrides the tokenizer/renderer model name used for local bridge tokenization and renderer construction. It is useful when the served API model name is an alias but the tokenizer should be loaded from the underlying Hugging Face model.
+
 `preserve_all_thinking` and `preserve_thinking_between_tool_calls` are forwarded to the underlying renderer when `client_type == "renderer"`. They control whether past-assistant `reasoning_content` is re-emitted on subsequent renders — `preserve_all_thinking` keeps every past-assistant turn's thinking, and `preserve_thinking_between_tool_calls` keeps thinking only inside the in-flight assistant→tool→…→assistant block after the most recent user turn (when that block contains at least one tool response). Both default to `False` (template default applies).
 
 When `api_key_var` is `"PRIME_API_KEY"` (the default), credentials are loaded with the following precedence:

diff --git a/docs/training.md b/docs/training.md
@@ -214,6 +214,8 @@ The rollout client's `client_type` controls how prompt assembly and token state
 - **`openai_chat_completions_token`** (TITO, *token-in*): server-side templating, but returns prompt and completion token IDs alongside text so the trainer doesn't re-tokenize. Use when you trust the server's chat template to be stable across turns.
 - **`renderer`** *(experimental)*: client-side tokenization via a per-model renderer in the [`renderers` package](https://github.com/PrimeIntellect-ai/verifiers/tree/main/packages/renderers). Install it with `uv add "verifiers[renderers]"` before using `client_type="renderer"`. The trainer renders messages to token IDs locally and sends those to vLLM's `/v1/generate` endpoint. The renderer's `bridge_to_next_turn` extends prior-turn tokens verbatim across multi-turn boundaries (the *extension property*) and synthesizes the canonical turn-close on mid-completion truncation, so multi-turn rollouts merge into one training sample with one clean loss mask.
 
+`openai_chat_completions_token` defaults to `renderer_transport="vllm"`, which uses vLLM's token routes. For Dynamo inference backends, set `renderer_transport="dynamo"` so multi-turn rollouts send the stitched prompt in `nvext.token_data` on `/v1/chat/completions` and read the server-returned token IDs from `nvext.engine_data`. In `prime-rl`, this is normally selected for you when `client.backend = "dynamo"`.
+
 For production RL training, use `openai_chat_completions_token` — it's the tried-and-tested path with broad model coverage. The `renderer` client is newer and offers stronger token-preservation guarantees in theory, but is experimental: hand-coded renderers exist only for a subset of models, and corner cases are still being shaken out. See [reference § Built-in Clients](reference.md#built-in-client-implementations) for the full list.
 
 ### Common Issues

diff --git a/tests/test_openai_chat_completions_token_client.py b/tests/test_openai_chat_completions_token_client.py
@@ -1,3 +1,5 @@
+import base64
+import json
 from typing import Any, cast
 
 import httpx
@@ -8,6 +10,7 @@
     OpenAIChatCompletionsTokenClient,
 )
 from verifiers.types import State
+from verifiers.utils.client_utils import post_chat_completion_with_routed_experts_sidecar
 
 
 class _NoopClient:
@@ -46,6 +49,40 @@ async def post(
         )
 
 
+class _DynamoRoutedExpertsClient(_NoopClient):
+    async def post(
+        self, path: str, body: dict[str, Any], cast_to: type, **kwargs: Any
+    ) -> Any:
+        payload = {
+            "id": "x",
+            "object": "chat.completion",
+            "created": 1,
+            "model": "test-model",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": "ok"},
+                    "finish_reason": "stop",
+                }
+            ],
+            "nvext": {
+                "engine_data": {
+                    "completion_token_ids": [10],
+                    "routed_experts": {
+                        "data": base64.b64encode(b"abc").decode("ascii"),
+                        "shape": [3, 1, 1],
+                        "start": 0,
+                        "dtype": "uint8",
+                    },
+                }
+            },
+        }
+        return httpx.Response(
+            200,
+            content=json.dumps(payload, separators=(",", ":")).encode("utf-8"),
+        )
+
+
 class _PromptIdTestClient(OpenAIChatCompletionsTokenClient):
     def __init__(self, full_prompt_ids: list[int]) -> None:
         super().__init__(_NoopClient())
@@ -293,3 +330,167 @@ async def fake_get_prompt_ids(  # noqa: ANN001
     assert len(recording_client.calls) == 1
     assert recording_client.calls[0]["path"] == "/chat/completions/tokens"
     assert recording_client.calls[0]["body"]["tokens"] == [10, 20]
+
+
+@pytest.mark.asyncio
+async def test_post_dynamo_scrubs_vllm_only_and_forwards_sampling():
+    """dynamo wire body: vLLM-only keys scrubbed, standard sampling args
+    forwarded, nvext token_data + passthrough preserved."""
+    recording_client = _RecordingClient()
+    client = OpenAIChatCompletionsTokenClient(recording_client)
+
+    await client._post_dynamo(
+        prompt=cast(Any, [{"role": "user", "content": ""}]),
+        prompt_ids=[1, 2, 3],
+        model="test-model",
+        tools=None,
+        sampling_args={
+            "temperature": 0.5,
+            "presence_penalty": 0.2,
+            "reasoning_effort": "high",  # arbitrary key: full parity, not an allowlist
+            "spaces_between_special_tokens": False,  # vLLM-only — must be scrubbed
+            "extra_body": {
+                "return_token_ids": True,  # vLLM-only — must be scrubbed
+                "nvext": {"extra_fields": ["engine_data"]},
+                "cache_salt": "ckpt-1",
+            },
+        },
+        extra_headers=None,
+    )
+
+    body = recording_client.calls[0]["body"]
+    assert "return_token_ids" not in body
+    assert "spaces_between_special_tokens" not in body
+    assert body["presence_penalty"] == 0.2
+    assert body["temperature"] == 0.5
+    assert body["reasoning_effort"] == "high"
+    assert body["nvext"]["token_data"] == [1, 2, 3]
+    assert body["nvext"]["extra_fields"] == ["engine_data"]
+    assert body["cache_salt"] == "ckpt-1"
+
+
+@pytest.mark.asyncio
+async def test_post_dynamo_uses_placeholder_messages():
+    recording_client = _RecordingClient()
+    client = OpenAIChatCompletionsTokenClient(recording_client)
+
+    await client._post_dynamo(
+        prompt=cast(Any, [{"role": "user", "content": "real prompt"}]),
+        prompt_ids=[1, 2, 3],
+        model="test-model",
+        tools=None,
+        sampling_args={"extra_body": {"nvext": {"extra_fields": ["engine_data"]}}},
+        extra_headers=None,
+    )
+
+    assert recording_client.calls[0]["body"]["messages"] == [
+        {"role": "user", "content": ""}
+    ]
+
+
+@pytest.mark.asyncio
+async def test_sidecar_helper_reattaches_dynamo_engine_routed_experts():
+    response = await post_chat_completion_with_routed_experts_sidecar(
+        _DynamoRoutedExpertsClient(),
+        "/chat/completions",
+        body={},
+    )
+
+    routed = response.model_extra["nvext"]["engine_data"]["routed_experts"]
+    assert isinstance(routed["data"], memoryview)
+    assert routed["data"].tobytes() == base64.b64encode(b"abc")
+
+
+@pytest.mark.asyncio
+async def test_graft_engine_data_synthesizes_logprobs_when_content_less():
+    """engine_data.completion_logprobs must be grafted even when the choice
+    carries a content-less logprobs object (not only when absent)."""
+    from openai.types.chat import ChatCompletion
+
+    client = OpenAIChatCompletionsClient(_NoopClient())
+    native = ChatCompletion.model_validate(
+        {
+            "id": "x",
+            "object": "chat.completion",
+            "created": 1,
+            "model": "test-model",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": "ok"},
+                    "finish_reason": "stop",
+                    "logprobs": {"content": None},  # present but content-less
+                }
+            ],
+            "nvext": {
+                "engine_data": {
+                    "completion_token_ids": [10, 11],
+                    "prompt_token_ids": [1, 2, 3],
+                    "completion_logprobs": [-0.1, -0.2],
+                }
+            },
+        }
+    )
+
+    vf_response = await client.from_native_response(native)
+    tokens = vf_response.message.tokens
+    assert tokens is not None  # would be None before the fix (TITO lost)
+    assert tokens.completion_ids == [10, 11]
+    assert tokens.prompt_ids == [1, 2, 3]
+    assert tokens.completion_logprobs == [-0.1, -0.2]
+
+
+@pytest.mark.asyncio
+async def test_parse_tokens_reads_dynamo_engine_routed_experts():
+    from openai.types.chat import ChatCompletion
+
+    client = OpenAIChatCompletionsClient(_NoopClient())
+    native = ChatCompletion.model_validate(
+        {
+            "id": "x",
+            "object": "chat.completion",
+            "created": 1,
+            "model": "test-model",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": "ok"},
+                    "finish_reason": "stop",
+                    "logprobs": {
+                        "content": [
+                            {
+                                "token": "ok",
+                                "logprob": -0.1,
+                                "bytes": [111, 107],
+                                "top_logprobs": [],
+                            }
+                        ]
+                    },
+                }
+            ],
+            "nvext": {
+                "engine_data": {
+                    "completion_token_ids": [10],
+                    "prompt_token_ids": [1, 2, 3],
+                    "completion_logprobs": [-0.1],
+                    "routed_experts": {
+                        "data": "QUJD",
+                        "shape": [3, 1, 1],
+                        "start": 0,
+                        "dtype": "uint8",
+                    },
+                }
+            },
+        }
+    )
+
+    vf_response = await client.from_native_response(native)
+    tokens = vf_response.message.tokens
+
+    assert tokens is not None
+    assert tokens.routed_experts == {
+        "data": "QUJD",
+        "shape": [3, 1, 1],
+        "start": 0,
+        "dtype": "uint8",
+    }
diff --git a/tests/test_trajectory_processing.py b/tests/test_trajectory_processing.py
@@ -483,3 +483,32 @@ def test_trajectory_step_mask_combining():
     assert token_ids == [1, 2, 3, 4, 5]
     assert mask == [0, 0, 0, 1, 1]
     assert logprobs == [0.0, 0.0, 0.0, -0.1, -0.2]
+
+
+def test_strip_routed_experts_data_key_order_robust():
+    """The zero-copy stripper must find ``data`` regardless of key order
+    (``dtype``/``shape``/``start`` may precede it) and no-op when absent."""
+    from verifiers.utils.response_utils import strip_routed_experts_data
+
+    # data first (fast path)
+    raw = b'{"routed_experts":{"data":"QUJD","shape":[3],"start":0,"dtype":"uint8"}}'
+    stripped, blob = strip_routed_experts_data(raw)
+    assert blob is not None and blob.tobytes() == b"QUJD"
+    assert b'"data":""' in stripped
+
+    # dtype/shape/start before data — must still strip the blob
+    raw2 = b'{"routed_experts":{"dtype":"uint16","shape":[3],"start":0,"data":"WFla"}}'
+    stripped2, blob2 = strip_routed_experts_data(raw2)
+    assert blob2 is not None and blob2.tobytes() == b"WFla"
+    assert b'"data":""' in stripped2
+
+    # routed_experts object lacks data; an unrelated sibling has data — must
+    # NOT be mistaken for routed experts (search bounded to the object).
+    raw4 = b'{"routed_experts":{"shape":[3],"start":0},"other":{"data":"UNRELATED"}}'
+    stripped4, blob4 = strip_routed_experts_data(raw4)
+    assert blob4 is None and stripped4 == raw4
+
+    # absent — no-op passthrough
+    raw3 = b'{"choices":[{"token_ids":[1,2]}]}'
+    stripped3, blob3 = strip_routed_experts_data(raw3)
+    assert blob3 is None and stripped3 == raw3