diff --git a/renderers/base.py b/renderers/base.py index ed5bc7e..2bacd70 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1,8 +1,6 @@ from __future__ import annotations -import contextlib import enum -import io import logging import queue import threading @@ -1164,7 +1162,7 @@ def _model_has_vision_config(model_name: str) -> bool: # Models for which ``fastokens`` is known to diverge from vanilla -# ``transformers.AutoTokenizer`` and therefore must NOT be patched. +# ``transformers.AutoTokenizer`` and therefore must NOT be adapted. # Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both # backends. The entries below fail to load under fastokens (DeepSeek-V3 # family — Metaspace pretokenizer not yet implemented). @@ -1182,7 +1180,7 @@ def _model_has_vision_config(model_name: str) -> bool: ) -_FASTOKENS_PATCH_LOCK = threading.Lock() +_FASTOKENS_ANNOUNCE_LOCK = threading.Lock() _FASTOKENS_ANNOUNCED = False @@ -1222,46 +1220,27 @@ def _preserve_requested_tokenizer_name( return tokenizer -def _patched_load(model_name_or_path: str, **kwargs): - """Run ``AutoTokenizer.from_pretrained`` with fastokens patched in - process-locally — patch around the load, unpatch right after. - - fastokens captures the loaded backend on a per-tokenizer basis, so - after we unpatch the returned tokenizer object continues to use - fastokens for ``encode``/``decode`` while subsequent - ``AutoTokenizer.from_pretrained`` calls (outside our control) go - back to vanilla. This keeps the global side effect minimal. - - fastokens itself prints ``[fastokens] patch_transformers: ...`` to - stdout on every patch/unpatch call. Building a pool of size N would - therefore emit ~N lines (more under thread contention, where some - threads see ``already patched``). We swallow those prints under a - lock — ``contextlib.redirect_stdout`` swaps ``sys.stdout`` - process-wide, so the lock keeps unrelated stdout writes from other - threads from disappearing into our buffer. The patch/unpatch calls - are cheap; only the brief patch+unpatch is serialized, the actual - ``from_pretrained`` still runs concurrently across pool slots. A - single ``logger.info`` is emitted on the first patch so the fast - path is still discoverable in logs. - """ - import fastokens +def _adapt_tokenizer_with_fastokens(tokenizer): + """Replace only one fully loaded tokenizer's backend with fastokens.""" + # This is the same shim used by patch_transformers, scoped to one object. + from fastokens._compat import _TokenizerShim global _FASTOKENS_ANNOUNCED - with _FASTOKENS_PATCH_LOCK: - with contextlib.redirect_stdout(io.StringIO()): - fastokens.patch_transformers() + backend = getattr(tokenizer, "_tokenizer", None) + if backend is None: + raise TypeError( + f"{type(tokenizer).__name__} has no fast tokenizer backend to adapt" + ) + tokenizer._tokenizer = _TokenizerShim(backend) + + with _FASTOKENS_ANNOUNCE_LOCK: if not _FASTOKENS_ANNOUNCED: logger.info( "fastokens enabled — tokenizers load through the Rust BPE fast path (~10x encode speedup)." ) _FASTOKENS_ANNOUNCED = True - try: - return _load_tokenizer_via_auto(model_name_or_path, **kwargs) - finally: - with _FASTOKENS_PATCH_LOCK: - with contextlib.redirect_stdout(io.StringIO()): - fastokens.unpatch_transformers() + return tokenizer def _load_fast_tokenizer_directly( @@ -1335,23 +1314,21 @@ def load_tokenizer( ``trust_remote_code=True`` AND a pinned ``revision=`` so transformers only executes the reviewed commit's tokenizer Python. - **Performance** — ``use_fastokens=True`` (default) routes the load - through ``fastokens.patch_transformers()`` so the resulting tokenizer - encodes ~10x faster than vanilla ``tokenizers``. The patch is - bracketed: it's applied before ``from_pretrained`` and removed - immediately after, so global ``AutoTokenizer.from_pretrained`` calls - elsewhere in the user's process are not affected. + **Performance** — ``use_fastokens=True`` (default) replaces only the + returned tokenizer's backend with the fastokens compatibility shim, so it + encodes ~10x faster than vanilla ``tokenizers`` without patching + Transformers process-wide during the load. Models in ``FASTOKENS_INCOMPATIBLE`` (DeepSeek-V3 family) skip the - patch — fastokens currently fails to load them. Pass + adaptation — fastokens currently fails to load them. Pass ``use_fastokens=False`` to force the vanilla backend for any other model. Unknown / fine-tuned model paths fall through to - ``trust_remote_code=False`` and the patched-load fast path. If - fastokens raises during the patched load (e.g. an unknown - pre-tokenizer type), we automatically retry with the vanilla - backend and emit an INFO log. + ``trust_remote_code=False`` and the fastokens adaptation path. If + fastokens rejects the loaded backend (e.g. an unknown pre-tokenizer + type), we automatically retry with the vanilla backend and emit an + INFO log. ``AutoTokenizer.from_pretrained`` eagerly builds the model config to resolve the tokenizer class. If that construction raises on a @@ -1368,26 +1345,19 @@ def load_tokenizer( load_name_or_path = _tokenizer_source_for(model_name_or_path) kwargs = _tokenizer_load_kwargs(load_name_or_path) - if not use_fastokens or load_name_or_path in FASTOKENS_INCOMPATIBLE: - tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs) - return _preserve_requested_tokenizer_name( - tok, - requested_name_or_path=model_name_or_path, - loaded_name_or_path=load_name_or_path, - ) - - try: - tok = _patched_load(load_name_or_path, **kwargs) - except Exception as exc: - logger.info( - "fastokens could not load %r (%s: %s); falling back to vanilla " - "AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in " - "renderers.base to suppress the retry.", - load_name_or_path, - type(exc).__name__, - str(exc)[:160], - ) - tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs) + tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs) + if use_fastokens and load_name_or_path not in FASTOKENS_INCOMPATIBLE: + try: + tok = _adapt_tokenizer_with_fastokens(tok) + except Exception as exc: + logger.info( + "fastokens could not adapt %r (%s: %s); using vanilla " + "AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in " + "renderers.base to suppress this message.", + load_name_or_path, + type(exc).__name__, + str(exc)[:160], + ) return _preserve_requested_tokenizer_name( tok, @@ -1721,8 +1691,8 @@ def trim_to_turn_close( # Per-model offset-aware tokenizer cache. ``attribute_text_segments`` # uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute # each token to its source text segment under one BPE pass. Fastokens -# (the Rust BPE we patch in by default for ~10x faster encode) does not -# track character offsets — the patched tokenizer's +# (the Rust BPE backend we install by default for ~10x faster encode) does not +# track character offsets — the adapted tokenizer's # ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we # keep a parallel vanilla tokenizer per model purely for offset queries. # Memory cost is one extra tokenizer per *unique* model name across all @@ -1773,44 +1743,18 @@ def _has_offsets(tok) -> bool: except (NotImplementedError, ValueError, TypeError): return False - # We want HF's Rust tokenizer with offset tracking, not the fastokens - # shim. The shim is installed by a *process-global* monkeypatch that - # ``load_tokenizer`` toggles per pool-slot load, so a plain reload here - # can race a concurrent slot's open patch window and silently pick up - # the offset-less shim (then get cached, poisoning the process). So: - # load, verify offsets, and if missing, reload with the patch forced - # off — serialized against pool patch/unpatch via ``_FASTOKENS_PATCH_LOCK`` - # so no concurrent window can swap the shim back in mid-load — then - # restore the prior patch state. Never cache a non-offset tokenizer. + # This path deliberately loads through vanilla Transformers because + # fastokens adaptation is scoped to ``load_tokenizer``'s returned object. offset_tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs) offset_tok = _preserve_requested_tokenizer_name( offset_tok, requested_name_or_path=name_or_path, loaded_name_or_path=load_name_or_path, ) - if not _has_offsets(offset_tok): - import fastokens - - with _FASTOKENS_PATCH_LOCK: - was_patched = bool(getattr(fastokens, "_patched", False)) - if was_patched: - with contextlib.redirect_stdout(io.StringIO()): - fastokens.unpatch_transformers() - try: - offset_tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs) - offset_tok = _preserve_requested_tokenizer_name( - offset_tok, - requested_name_or_path=name_or_path, - loaded_name_or_path=load_name_or_path, - ) - finally: - if was_patched: - with contextlib.redirect_stdout(io.StringIO()): - fastokens.patch_transformers() if not _has_offsets(offset_tok): raise RuntimeError( f"Could not load an offset-capable tokenizer for {name_or_path!r}: " - "offset_mapping is unavailable even with the fastokens patch off. " + "offset_mapping is unavailable from vanilla Transformers. " "Hand-coded renderers require a fast tokenizer for body/scaffold " "attribution." ) @@ -1840,7 +1784,7 @@ def attribute_text_segments( the most recently entered segment. Requires a HuggingFace fast tokenizer with offset tracking. The - ``fastokens`` patch ``load_tokenizer`` applies by default does + The ``fastokens`` backend ``load_tokenizer`` installs by default does **not** track offsets — when that's the case we transparently load a vanilla offset-capable tokenizer for the same model and cache it (see :func:`_get_offset_tokenizer`). Hand-coded renderers are only diff --git a/tests/test_load_tokenizer_fastokens.py b/tests/test_load_tokenizer_fastokens.py index 28c42a0..789fd8d 100644 --- a/tests/test_load_tokenizer_fastokens.py +++ b/tests/test_load_tokenizer_fastokens.py @@ -1,8 +1,8 @@ """Coverage for the fastokens fast-path in ``renderers.base.load_tokenizer``. -``load_tokenizer`` defaults to routing every supported model through -``fastokens.patch_transformers()`` for ~10x faster encode. Models in -``FASTOKENS_INCOMPATIBLE`` skip the patch (DeepSeek's Metaspace +``load_tokenizer`` defaults to adapting every supported model's returned +backend with fastokens for ~10x faster encode. Models in +``FASTOKENS_INCOMPATIBLE`` skip adaptation (DeepSeek's Metaspace pretokenizer isn't supported). Callers can opt out per-call with ``use_fastokens=False``. @@ -16,16 +16,18 @@ 3. With ``use_fastokens=False``, the resulting tokenizer is vanilla. 4. For incompat models, the fast path is silently skipped and the tokenizer still loads + encodes correctly. -5. The fastokens patch is removed immediately after the load so it - doesn't leak into the caller's process — subsequent - ``AutoTokenizer.from_pretrained`` calls outside ``load_tokenizer`` - use vanilla. +5. Fastokens adaptation is scoped to the returned tokenizer, so concurrent + and subsequent ``AutoTokenizer.from_pretrained`` calls stay vanilla. """ from __future__ import annotations +import concurrent.futures +import threading + import pytest -from transformers import AutoTokenizer +from tokenizers import Tokenizer, models +from transformers import AutoTokenizer, PreTrainedTokenizerFast from renderers.base import ( FASTOKENS_INCOMPATIBLE, @@ -93,13 +95,16 @@ def test_fast_and_vanilla_encode_identically_on_compatible_model(): " ".join([f"word_{i}" for i in range(50)]), ] for s in samples: - assert fast.encode(s, add_special_tokens=False) == vanilla.encode( - s, add_special_tokens=False - ), f"encode diverged on {s!r}" + fast_ids = fast.encode(s, add_special_tokens=False) + vanilla_ids = vanilla.encode(s, add_special_tokens=False) + assert fast_ids == vanilla_ids, f"encode diverged on {s!r}" + assert fast.decode(fast_ids) == vanilla.decode(vanilla_ids), ( + f"decode diverged on {s!r}" + ) # --------------------------------------------------------------------------- -# Denylist: incompat models silently skip the patch and still load. +# Denylist: incompat models silently skip adaptation and still load. # --------------------------------------------------------------------------- @@ -120,7 +125,7 @@ def test_incompat_model_loads_via_vanilla_backend(model): pytest.skip(f"{model}: repo unreachable in this env ({e})") tok = load_tokenizer(model) assert "Shim" not in _backend_class_name(tok), ( - f"{model}: should NOT have been patched; got {_backend_class_name(tok)!r}" + f"{model}: should NOT have been adapted; got {_backend_class_name(tok)!r}" ) # And it still encodes. ids = tok.encode("hello", add_special_tokens=False) @@ -128,41 +133,70 @@ def test_incompat_model_loads_via_vanilla_backend(model): # --------------------------------------------------------------------------- -# Patch must not leak: AutoTokenizer.from_pretrained calls OUTSIDE -# load_tokenizer should still produce a vanilla tokenizer. +# Adaptation must not leak: AutoTokenizer.from_pretrained calls OUTSIDE +# load_tokenizer should always produce a vanilla tokenizer. # --------------------------------------------------------------------------- -def test_patch_is_unloaded_after_call(): - """``load_tokenizer`` brackets the fastokens patch. After it returns - a fastokens-shimmed tokenizer, a fresh ``AutoTokenizer.from_pretrained`` - call must NOT pick up the patch — the user's process stays clean.""" +def test_fastokens_is_scoped_to_loaded_tokenizer(): + """A fresh ``AutoTokenizer`` call stays vanilla after fast adaptation.""" fast = load_tokenizer(_FAST_MODEL) assert "Shim" in _backend_class_name(fast), "preconditions: fast path active" # Now call AutoTokenizer.from_pretrained directly. It MUST be vanilla. direct = AutoTokenizer.from_pretrained(_FAST_MODEL, trust_remote_code=False) assert "Shim" not in _backend_class_name(direct), ( - f"fastokens patch leaked into user-side AutoTokenizer call: " + f"fastokens leaked into user-side AutoTokenizer call: " f"got {_backend_class_name(direct)!r}" ) +def test_fastokens_load_does_not_patch_transformers_concurrently(monkeypatch, tmp_path): + """A slow renderer load must not expose fastokens to unrelated callers.""" + import renderers.base as rb + + backend = Tokenizer(models.BPE({"[UNK]": 0, "hello": 1}, [], unk_token="[UNK]")) + PreTrainedTokenizerFast( + tokenizer_object=backend, unk_token="[UNK]" + ).save_pretrained(tmp_path) + + started = threading.Event() + release = threading.Event() + tokenizer = AutoTokenizer.from_pretrained(tmp_path) + + def slow_load(*args, **kwargs): + started.set() + assert release.wait(timeout=5) + return tokenizer + + monkeypatch.setattr(rb, "_load_tokenizer_via_auto", slow_load) + + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + loaded = executor.submit(load_tokenizer, "test-model") + assert started.wait(timeout=5) + try: + direct = AutoTokenizer.from_pretrained(tmp_path) + assert "Shim" not in _backend_class_name(direct) + finally: + release.set() + + assert "Shim" in _backend_class_name(loaded.result()) + + # --------------------------------------------------------------------------- -# Failure-mode fallback: if fastokens raises during the patched load, +# Failure-mode fallback: if fastokens raises during per-tokenizer adaptation, # load_tokenizer falls back to vanilla without surfacing the error. # --------------------------------------------------------------------------- -def test_fallback_on_fastokens_load_error(monkeypatch): - """Simulate fastokens raising during patched load — load_tokenizer - should fall back to vanilla and return a working tokenizer.""" +def test_fallback_on_fastokens_adaptation_error(monkeypatch): + """An adaptation error returns the already-loaded vanilla tokenizer.""" import renderers.base as rb def _boom(*args, **kwargs): raise ValueError("simulated fastokens failure: unsupported pre-tokenizer") - monkeypatch.setattr(rb, "_patched_load", _boom) + monkeypatch.setattr(rb, "_adapt_tokenizer_with_fastokens", _boom) tok = load_tokenizer(_FAST_MODEL) # default use_fastokens=True # The vanilla fallback ran — backend is not a fastokens shim. @@ -172,18 +206,12 @@ def _boom(*args, **kwargs): # --------------------------------------------------------------------------- -# Print suppression: fastokens itself prints "[fastokens] -# patch_transformers: ..." on every patch/unpatch call. Building a -# RendererPool of size N would emit ~N lines (the pool factory calls -# load_tokenizer once per slot). load_tokenizer swallows that stdout -# chatter and emits a single INFO log on the first patch instead. +# Fastokens adaptation emits one INFO log per process, not once per pool slot. # --------------------------------------------------------------------------- def test_no_fastokens_stdout_chatter(capsys, caplog): - """``load_tokenizer`` must not leak ``[fastokens]`` prints onto - stdout, and must emit exactly one INFO log per process announcing - the fast path (not once per call).""" + """Fast adaptation stays quiet and announces its path once per process.""" import logging import renderers.base as rb