Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,6 @@ dependencies = [
# against 0.0.8) and ``tests/test_gpt_oss_harmony_parity.py`` passes on it,
# so the older harmony is safe.
"openai-harmony>=0.0.4",
# Crusoe's Rust BPE tokenizer; ~10x faster encode vs HF's tokenizers.
# ``load_tokenizer`` patches it in by default for every supported model
# except a small denylist (DeepSeek-V3 family). The patch is bracketed
# around ``from_pretrained``, so subsequent ``AutoTokenizer`` calls
# outside the renderers package stay vanilla.
"fastokens>=0.2.0",
# ``BaseRendererConfig`` inherits from ``pydantic_config.BaseConfig`` so
# the typed-config surface stays uniform with prime-rl / verifiers config
# bases. Transitively brings pydantic, which ``renderers.configs`` also
Expand Down Expand Up @@ -103,12 +97,6 @@ required-version = ">=0.11.1"
exclude-newer = "7 days"

[tool.uv.exclude-newer-package]
# fastokens 0.2.0 was published on 2026-05-17 and contains the
# ``unpatch_transformers`` fix (crusoecloud/fastokens#32) needed for
# MiniMax-M2's slow→fast tokenizer conversion path. Exempting it from
# the project-wide 7-day cutoff lets the lockfile pick it up immediately
# while the rest of the dependency graph stays gated.
fastokens = false
# PrimeIntellect-published packages in this project's dependency closure —
# fast-track so first-party releases can land same-day. Only packages that
# appear in `uv tree` are listed.
Expand Down
243 changes: 28 additions & 215 deletions renderers/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from __future__ import annotations

import contextlib
import enum
import io
import logging
import queue
import threading
Expand Down Expand Up @@ -1163,29 +1161,6 @@ def _model_has_vision_config(model_name: str) -> bool:
}


# Models for which ``fastokens`` is known to diverge from vanilla
# ``transformers.AutoTokenizer`` and therefore must NOT be patched.
# An empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
# backends; the entries below fail to load under fastokens (DeepSeek-V3
# family — Metaspace pretokenizer not yet implemented).
FASTOKENS_INCOMPATIBLE: frozenset[str] = frozenset(
{
# fastokens: ``ValueError: pre-tokenizer error: unsupported
# pre-tokenizer type: Metaspace`` — DeepSeek's tokenizer uses
# SentencePiece-style Metaspace pretokenization which fastokens
# doesn't yet implement.
"deepseek-ai/DeepSeek-V3",
"deepseek-ai/DeepSeek-V3-Base",
"deepseek-ai/DeepSeek-R1",
"deepseek-ai/DeepSeek-R1-0528",
}
)


_FASTOKENS_PATCH_LOCK = threading.Lock()
_FASTOKENS_ANNOUNCED = False


def _tokenizer_source_for(model_name_or_path: str) -> str:
return TOKENIZER_SOURCE_OVERRIDES.get(model_name_or_path, model_name_or_path)

Expand Down Expand Up @@ -1222,48 +1197,6 @@ def _preserve_requested_tokenizer_name(
return tokenizer


def _patched_load(model_name_or_path: str, **kwargs):
"""Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
process-locally — patch around the load, unpatch right after.

fastokens captures the loaded backend on a per-tokenizer basis, so
after we unpatch the returned tokenizer object continues to use
fastokens for ``encode``/``decode`` while subsequent
``AutoTokenizer.from_pretrained`` calls (outside our control) go
back to vanilla. This keeps the global side effect minimal.

fastokens itself prints ``[fastokens] patch_transformers: ...`` to
stdout on every patch/unpatch call. Building a pool of size N would
therefore emit ~N lines (more under thread contention, where some
threads see ``already patched``). We swallow those prints under a
lock — ``contextlib.redirect_stdout`` swaps ``sys.stdout``
process-wide, so the lock keeps unrelated stdout writes from other
threads from disappearing into our buffer. The patch/unpatch calls
are cheap; only the brief patch+unpatch is serialized, the actual
``from_pretrained`` still runs concurrently across pool slots. A
single ``logger.info`` is emitted on the first patch so the fast
path is still discoverable in logs.
"""
import fastokens

global _FASTOKENS_ANNOUNCED

with _FASTOKENS_PATCH_LOCK:
with contextlib.redirect_stdout(io.StringIO()):
fastokens.patch_transformers()
if not _FASTOKENS_ANNOUNCED:
logger.info(
"fastokens enabled — tokenizers load through the Rust BPE fast path (~10x encode speedup)."
)
_FASTOKENS_ANNOUNCED = True
try:
return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
finally:
with _FASTOKENS_PATCH_LOCK:
with contextlib.redirect_stdout(io.StringIO()):
fastokens.unpatch_transformers()


def _load_fast_tokenizer_directly(
model_name_or_path: str, revision: str | None
) -> Any | None:
Expand Down Expand Up @@ -1323,36 +1256,14 @@ def _load_tokenizer_via_auto(model_name_or_path: str, **kwargs) -> Any:
return tok


def load_tokenizer(
model_name_or_path: str,
*,
use_fastokens: bool = True,
):
"""Load a tokenizer with the renderers-package security + perf policy.
def load_tokenizer(model_name_or_path: str):
"""Load a tokenizer with the renderers-package security policy.

**Security** — default ``trust_remote_code=False``. Models listed in
Default ``trust_remote_code=False``. Models listed in
``TRUSTED_REVISIONS`` (Moonshot Kimi-K2 family) load with
``trust_remote_code=True`` AND a pinned ``revision=<sha>`` so
transformers only executes the reviewed commit's tokenizer Python.

**Performance** — ``use_fastokens=True`` (default) routes the load
through ``fastokens.patch_transformers()`` so the resulting tokenizer
encodes ~10x faster than vanilla ``tokenizers``. The patch is
bracketed: it's applied before ``from_pretrained`` and removed
immediately after, so global ``AutoTokenizer.from_pretrained`` calls
elsewhere in the user's process are not affected.

Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family) skip the
patch — fastokens currently fails to load them. Pass
``use_fastokens=False`` to force the vanilla backend for any other
model.

Unknown / fine-tuned model paths fall through to
``trust_remote_code=False`` and the patched-load fast path. If
fastokens raises during the patched load (e.g. an unknown
pre-tokenizer type), we automatically retry with the vanilla
backend and emit an INFO log.

``AutoTokenizer.from_pretrained`` eagerly builds the model config to
resolve the tokenizer class. If that construction raises on a
modeling-only concern the tokenizer doesn't need (e.g. RoPE
Expand All @@ -1367,28 +1278,7 @@ def load_tokenizer(
"""
load_name_or_path = _tokenizer_source_for(model_name_or_path)
kwargs = _tokenizer_load_kwargs(load_name_or_path)

if not use_fastokens or load_name_or_path in FASTOKENS_INCOMPATIBLE:
tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
return _preserve_requested_tokenizer_name(
tok,
requested_name_or_path=model_name_or_path,
loaded_name_or_path=load_name_or_path,
)

try:
tok = _patched_load(load_name_or_path, **kwargs)
except Exception as exc:
logger.info(
"fastokens could not load %r (%s: %s); falling back to vanilla "
"AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in "
"renderers.base to suppress the retry.",
load_name_or_path,
type(exc).__name__,
str(exc)[:160],
)
tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)

tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
return _preserve_requested_tokenizer_name(
tok,
requested_name_or_path=model_name_or_path,
Expand Down Expand Up @@ -1718,104 +1608,28 @@ def trim_to_turn_close(
return previous_ids


# Per-model offset-aware tokenizer cache. ``attribute_text_segments``
# uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute
# each token to its source text segment under one BPE pass. Fastokens
# (the Rust BPE we patch in by default for ~10x faster encode) does not
# track character offsets — the patched tokenizer's
# ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we
# keep a parallel vanilla tokenizer per model purely for offset queries.
# Memory cost is one extra tokenizer per *unique* model name across all
# pools / renderers (the cache is process-global), independent of pool
# size.
_offset_tokenizers: dict[str, Any] = {}
_offset_tokenizers_lock = threading.Lock()


def _get_offset_tokenizer(tokenizer):
"""Return a tokenizer that supports ``return_offsets_mapping=True``.

If ``tokenizer`` itself supports offsets, returns it unchanged.
Otherwise loads a vanilla (non-fastokens) tokenizer from
``tokenizer.name_or_path`` and caches it. Raises if the tokenizer
has no usable ``name_or_path`` — hand-coded renderers always pass
a tokenizer loaded via ``load_tokenizer`` which does set it.
"""Assert ``tokenizer`` supports ``return_offsets_mapping=True``.

Hand-coded renderers concatenate scaffold + body in one BPE pass to
preserve cross-boundary merges, then attribute each resulting token
back to its source segment via the fast tokenizer's
``offset_mapping`` (see :func:`attribute_text_segments`). The
contract: every BYO tokenizer must be a fast tokenizer with offset
support. Tokenizers loaded via :func:`load_tokenizer` are
``PreTrainedTokenizerFast`` instances that satisfy this trivially.
"""
# Cheap probe: does this tokenizer already provide offsets?
try:
tokenizer("a", add_special_tokens=False, return_offsets_mapping=True)
return tokenizer
except (NotImplementedError, ValueError, TypeError):
pass

name_or_path = getattr(tokenizer, "name_or_path", "")
if not name_or_path:
except (NotImplementedError, ValueError, TypeError) as exc:
raise RuntimeError(
"Cannot construct an offset-aware tokenizer: the supplied "
"tokenizer has no ``name_or_path`` to fall back on. Pass a "
"tokenizer loaded via ``renderers.base.load_tokenizer``."
)

with _offset_tokenizers_lock:
cached = _offset_tokenizers.get(name_or_path)
if cached is not None:
return cached

load_name_or_path = _tokenizer_source_for(name_or_path)
kwargs = _tokenizer_load_kwargs(load_name_or_path)

def _has_offsets(tok) -> bool:
if not getattr(tok, "is_fast", False):
return False
try:
tok("a", add_special_tokens=False, return_offsets_mapping=True)
return True
except (NotImplementedError, ValueError, TypeError):
return False

# We want HF's Rust tokenizer with offset tracking, not the fastokens
# shim. The shim is installed by a *process-global* monkeypatch that
# ``load_tokenizer`` toggles per pool-slot load, so a plain reload here
# can race a concurrent slot's open patch window and silently pick up
# the offset-less shim (then get cached, poisoning the process). So:
# load, verify offsets, and if missing, reload with the patch forced
# off — serialized against pool patch/unpatch via ``_FASTOKENS_PATCH_LOCK``
# so no concurrent window can swap the shim back in mid-load — then
# restore the prior patch state. Never cache a non-offset tokenizer.
offset_tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
offset_tok = _preserve_requested_tokenizer_name(
offset_tok,
requested_name_or_path=name_or_path,
loaded_name_or_path=load_name_or_path,
)
if not _has_offsets(offset_tok):
import fastokens

with _FASTOKENS_PATCH_LOCK:
was_patched = bool(getattr(fastokens, "_patched", False))
if was_patched:
with contextlib.redirect_stdout(io.StringIO()):
fastokens.unpatch_transformers()
try:
offset_tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
offset_tok = _preserve_requested_tokenizer_name(
offset_tok,
requested_name_or_path=name_or_path,
loaded_name_or_path=load_name_or_path,
)
finally:
if was_patched:
with contextlib.redirect_stdout(io.StringIO()):
fastokens.patch_transformers()
if not _has_offsets(offset_tok):
raise RuntimeError(
f"Could not load an offset-capable tokenizer for {name_or_path!r}: "
"offset_mapping is unavailable even with the fastokens patch off. "
"Hand-coded renderers require a fast tokenizer for body/scaffold "
"attribution."
)
_offset_tokenizers[name_or_path] = offset_tok
return offset_tok
"Hand-coded renderers require a fast tokenizer with "
"``return_offsets_mapping=True`` support for body/scaffold "
"attribution. Pass a tokenizer loaded via "
"``renderers.base.load_tokenizer``, or any "
"``transformers.PreTrainedTokenizerFast`` instance."
) from exc
return tokenizer


def attribute_text_segments(
Expand All @@ -1839,14 +1653,13 @@ def attribute_text_segments(
tokens (rare; usually pre-tokenizer artefacts) are attributed to
the most recently entered segment.

Requires a HuggingFace fast tokenizer with offset tracking. The
``fastokens`` patch ``load_tokenizer`` applies by default does
**not** track offsets — when that's the case we transparently load
a vanilla offset-capable tokenizer for the same model and cache it
(see :func:`_get_offset_tokenizer`). Hand-coded renderers are only
registered for model families that ship a fast tokenizer, so a
silent slow-tokenizer fallback isn't supported — BPE drift at the
wrap/body boundary would defeat the whole point.
Requires a HuggingFace fast tokenizer with offset tracking. Every
model in ``MODEL_RENDERER_MAP`` ships one, so the offset lookup
always succeeds for tokenizers obtained via :func:`load_tokenizer`.
A BYO tokenizer must be a ``PreTrainedTokenizerFast`` (or anything
else that supports ``return_offsets_mapping=True``); slow tokenizers
aren't supported — BPE drift at the wrap/body boundary would
defeat the whole point.

Empty input or empty joined text returns an empty list.
"""
Expand Down
46 changes: 14 additions & 32 deletions tests/test_load_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from types import SimpleNamespace
from unittest.mock import patch

import pytest

from renderers import base
from renderers.base import TOKENIZER_SOURCE_OVERRIDES, TRUSTED_REVISIONS, load_tokenizer

Expand Down Expand Up @@ -81,7 +83,7 @@ def test_meta_llama_loads_tokenizer_from_unsloth_mirror(mock_from_pretrained):
mirror = "unsloth/Llama-3.2-1B-Instruct"
mock_from_pretrained.return_value = SimpleNamespace(name_or_path=mirror)

tok = load_tokenizer(canonical, use_fastokens=False)
tok = load_tokenizer(canonical)

args, kwargs = mock_from_pretrained.call_args
assert args == (mirror,)
Expand Down Expand Up @@ -120,42 +122,22 @@ def test_tokenizer_source_overrides_are_exact_llama_mirrors():
}


def test_offset_tokenizer_uses_unsloth_mirror_for_meta_llama(monkeypatch):
"""Offset-tokenizer reloads must use the same unrestricted source
override, otherwise Llama rendering can hit the gated Meta repo after
the initial tokenizer load succeeds."""
def test_get_offset_tokenizer_rejects_offsetless_byo():
"""BYO tokenizers without ``return_offsets_mapping`` support raise a
clear error. Hand-coded renderers concatenate scaffold + body in one
BPE pass and attribute tokens via the fast tokenizer's offset map;
no transparent reload-from-name_or_path fallback exists. The
contract is: pass a fast tokenizer or get a loud error at construct
time, not silent BPE drift at the wrap/body boundary."""

class _NoOffsets:
name_or_path = "meta-llama/Llama-3.2-1B-Instruct"

def __call__(self, *args, **kwargs):
raise NotImplementedError("fastokens shim has no offsets")

class _OffsetTokenizer:
is_fast = True

def __init__(self, name_or_path: str):
self.name_or_path = name_or_path
name_or_path = "anywhere/anything"

def __call__(self, *args, **kwargs):
return {"offset_mapping": [(0, 1)]}

calls = []

def _fake_load(name_or_path, **kwargs):
calls.append((name_or_path, kwargs))
return _OffsetTokenizer(name_or_path)

base._offset_tokenizers.clear()
monkeypatch.setattr(base, "_load_tokenizer_via_auto", _fake_load)

try:
tok = base._get_offset_tokenizer(_NoOffsets())
finally:
base._offset_tokenizers.clear()
raise NotImplementedError("BYO tokenizer has no offsets")

assert calls == [("unsloth/Llama-3.2-1B-Instruct", {"trust_remote_code": False})]
assert tok.name_or_path == "meta-llama/Llama-3.2-1B-Instruct"
with pytest.raises(RuntimeError, match="fast tokenizer.*offsets"):
base._get_offset_tokenizer(_NoOffsets())


# ---------------------------------------------------------------------------
Expand Down
Loading
Loading