From afa4c9b274eed7d2266b17c9d6ea6a9bfd53f7d6 Mon Sep 17 00:00:00 2001 From: hallerite Date: Wed, 17 Jun 2026 23:49:00 +0000 Subject: [PATCH] feat(renderers): v3 collapse closures + tokenizers.Encoding offset API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related refactors of the emit_text_segments / attribute_text_segments pipeline: 1. ``emit_text_segments`` closures across 8 hand-coded renderers (qwen3, qwen35, glm45, glm5, deepseek_v3, nemotron3, laguna_xs2, minimax_m2) get a "collapse-or-fallback" pattern: adjacent same-label segments are folded into one ``emit_text`` call (preserves internal BPE merges, skips the offset path); only genuinely mixed-label runs go through ``attribute_text_segments``. Most rendering paths end up homogeneous after collapse, so the offset machinery only runs when it actually has to. 2. ``attribute_text_segments`` is rewritten to use the Rust ``tokenizers.Encoding`` API directly — ``.encode().ids`` / ``.encode().offsets`` — instead of going through ``transformers``'s ``return_offsets_mapping=True`` dict API. This unblocks the future ``transformers``-optional path (issue #31): a BYO ``tokenizers.Tokenizer`` works without any ``transformers`` wrapper. ``_get_offset_tokenizer`` becomes a 2-path resolver (direct Rust tokenizer, or extract ``.backend_tokenizer`` from a ``PreTrainedTokenizerFast``); no second tokenizer load, no probe-verify, no AutoTokenizer fallback — all of those existed in the previous version of this PR to coordinate with the fastokens shim, which is gone after #95. ``minimax_m2.emit_token_overlap_body`` and ``qwen3_vl._Emitter._flush`` are updated to call the new ``Encoding``-based offset API directly. ``tokenizers>=0.20`` becomes an explicit core dependency — it was already a transitive dependency of ``transformers``, but the new ``attribute_text_segments`` path imports from ``tokenizers`` directly (a function-level import in ``_get_offset_tokenizer``), so we declare it. 
Tests: 2248 passed, 88 skipped, 1 xfailed (baseline parity with #95). Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 5 ++ renderers/base.py | 109 ++++++++++++++++++----------------- renderers/deepseek_v3.py | 21 ++++++- renderers/glm45.py | 47 +++++++++++---- renderers/glm5.py | 47 +++++++++++---- renderers/laguna_xs2.py | 40 ++++++++++++- renderers/minimax_m2.py | 79 +++++++++++++++++-------- renderers/nemotron3.py | 40 ++++++++++++- renderers/qwen3.py | 47 +++++++++++---- renderers/qwen35.py | 47 +++++++++++---- renderers/qwen3_vl.py | 23 ++++---- tests/test_load_tokenizer.py | 21 ++++--- 12 files changed, 378 insertions(+), 148 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dc75397..b292663 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,11 @@ dependencies = [ "openai>=1.108.1", "tiktoken", "jinja2", + # HuggingFace's Rust BPE library. ``_get_offset_tokenizer`` resolves a + # ``tokenizers.Tokenizer`` (passed directly or taken from + # ``backend_tokenizer``) for offset-aware encoding — body/scaffold + # attribution on the render path, with no extra tokenizer load. + "tokenizers>=0.20", "transformers>=4.50.0", # Used by GptOssRenderer to render and parse harmony tokens. Vendoring # OpenAI's reference implementation keeps us byte-identical with vLLM diff --git a/renderers/base.py b/renderers/base.py index 64a8760..20646a6 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1609,27 +1609,44 @@ def trim_to_turn_close( def _get_offset_tokenizer(tokenizer): - """Assert ``tokenizer`` supports ``return_offsets_mapping=True``. + """Return a ``tokenizers.Tokenizer`` (Rust BPE) for offset-aware encoding. Hand-coded renderers concatenate scaffold + body in one BPE pass to preserve cross-boundary merges, then attribute each resulting token - back to its source segment via the fast tokenizer's - ``offset_mapping`` (see :func:`attribute_text_segments`). 
The - contract: every BYO tokenizer must be a fast tokenizer with offset - support. Tokenizers loaded via :func:`load_tokenizer` are - ``PreTrainedTokenizerFast`` instances that satisfy this trivially. + back to its source segment via the Rust tokenizer's native + ``Encoding.offsets`` (see :func:`attribute_text_segments`). + + Resolution: + + 1. If ``tokenizer`` is already a ``tokenizers.Tokenizer``, return + it as-is (BYO Rust BPE — no extra load). + 2. If ``tokenizer.backend_tokenizer`` is a ``tokenizers.Tokenizer`` + (the standard ``PreTrainedTokenizerFast`` shape), use it + directly — no extra load. + + Errors loudly otherwise. Slow / wrapper tokenizers without a Rust + backend can't produce offsets, and silently falling back would + lose attribution at the wrap/body boundary. """ - try: - tokenizer("a", add_special_tokens=False, return_offsets_mapping=True) - except (NotImplementedError, ValueError, TypeError) as exc: - raise RuntimeError( - "Hand-coded renderers require a fast tokenizer with " - "``return_offsets_mapping=True`` support for body/scaffold " - "attribution. Pass a tokenizer loaded via " - "``renderers.base.load_tokenizer``, or any " - "``transformers.PreTrainedTokenizerFast`` instance." - ) from exc - return tokenizer + from tokenizers import Tokenizer as RustTokenizer + + # Path 1: already a tokenizers.Tokenizer. + if isinstance(tokenizer, RustTokenizer): + return tokenizer + + # Path 2: PreTrainedTokenizerFast exposes its underlying + # tokenizers.Tokenizer via ``backend_tokenizer``. + backend = getattr(tokenizer, "backend_tokenizer", None) + if isinstance(backend, RustTokenizer): + return backend + + raise RuntimeError( + "Hand-coded renderers require a fast tokenizer with a " + "``tokenizers.Tokenizer`` backend for body/scaffold attribution. " + "Pass a tokenizer loaded via ``renderers.base.load_tokenizer``, " + "any ``transformers.PreTrainedTokenizerFast`` instance, or a " + "``tokenizers.Tokenizer`` directly." 
+ ) def attribute_text_segments( @@ -1644,22 +1661,23 @@ def attribute_text_segments( (content, True)]`` for a user message. Concatenation is done before encoding to preserve BPE merges across the wrap/body boundary; the resulting tokens are then attributed back to their source segment - via the fast tokenizer's ``offset_mapping``. + via ``tokenizers.Encoding.offsets``. A token is attributed to the segment containing its first source - character (``offset_mapping[k][0]``). Tokens whose first character - falls exactly on a segment boundary are attributed to the segment - that *starts* at that offset (the "later" segment). Zero-length - tokens (rare; usually pre-tokenizer artefacts) are attributed to - the most recently entered segment. - - Requires a HuggingFace fast tokenizer with offset tracking. Every - model in ``MODEL_RENDERER_MAP`` ships one, so the offset lookup - always succeeds for tokenizers obtained via :func:`load_tokenizer`. - BYO tokenizers must be a ``PreTrainedTokenizerFast`` (or anything - else exposing ``return_offsets_mapping=True``); slow tokenizers - aren't supported — BPE drift at the wrap/body boundary would - defeat the whole point. + character (``offsets[k][0]``). Tokens whose first character falls + exactly on a segment boundary go to the segment that *starts* at + that offset (the "later" segment). Zero-length tokens (rare; + pre-tokenizer artefacts) are attributed to the most recently + entered segment. + + Uses the Rust ``tokenizers`` library directly via + :func:`_get_offset_tokenizer` — no ``transformers`` dependency on + this path. Every model in ``MODEL_RENDERER_MAP`` ships a fast + ``tokenizer.json`` so the Rust backend resolves universally. BYO + tokenizers must expose a ``tokenizers.Tokenizer`` (either directly + or as ``PreTrainedTokenizerFast.backend_tokenizer``); slow + tokenizers aren't supported — BPE drift at the wrap/body boundary + would defeat the whole point. Empty input or empty joined text returns an empty list. 
""" @@ -1670,47 +1688,34 @@ def attribute_text_segments( return [] offset_tokenizer = _get_offset_tokenizer(tokenizer) - encoding = offset_tokenizer( - full_text, - add_special_tokens=False, - return_offsets_mapping=True, - ) - token_ids = list(encoding["input_ids"]) - offsets = list(encoding["offset_mapping"]) + encoding = offset_tokenizer.encode(full_text, add_special_tokens=False) + token_ids = list(encoding.ids) + offsets = list(encoding.offsets) # Build segment char-span lookup. Track the half-open span # [seg_start, seg_end) of each segment and its is_content bit. - spans: list[tuple[int, int, bool]] = [] + spans: "list[tuple[int, int, bool]]" = [] pos = 0 for text, is_content in segments: spans.append((pos, pos + len(text), is_content)) pos += len(text) total_len = pos - out: list[tuple[int, bool]] = [] + out: "list[tuple[int, bool]]" = [] last_is_content = spans[-1][2] if spans else False for tok_id, (start, _end) in zip(token_ids, offsets): if start >= total_len: - # Token's character offset is past every segment (shouldn't - # normally happen for add_special_tokens=False, but defensive - # against tokenizer-specific edge cases). + # Token's char offset is past every segment (shouldn't + # normally happen for add_special_tokens=False, but defensive). out.append((tok_id, last_is_content)) continue - # Find the segment that contains `start`. Segments are - # contiguous and ordered, so a linear scan is fine — the inner - # loop runs at most len(segments) times per token and segments + # Find the segment that contains `start`. Linear scan — segments # is typically 2-3 in practice. is_content = last_is_content for seg_start, seg_end, seg_is_content in spans: if seg_start <= start < seg_end: is_content = seg_is_content break - else: - # start == total_len handled above; the remaining case is - # an empty segment in the middle. Empty segments emit no - # characters, so no token can land in them; fall through to - # the last non-empty segment's bit. 
- pass out.append((tok_id, is_content)) return out diff --git a/renderers/deepseek_v3.py b/renderers/deepseek_v3.py index 5f4840a..01b23bc 100644 --- a/renderers/deepseek_v3.py +++ b/renderers/deepseek_v3.py @@ -17,11 +17,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, trim_to_turn_close, @@ -153,8 +153,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) diff --git a/renderers/glm45.py b/renderers/glm45.py index 7af9259..73ff601 100644 --- a/renderers/glm45.py +++ b/renderers/glm45.py @@ -16,11 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -146,15 +146,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. - - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -377,8 +387,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/glm5.py b/renderers/glm5.py index 924d754..bd344e7 100644 --- a/renderers/glm5.py +++ b/renderers/glm5.py @@ -17,11 +17,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -166,15 +166,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. 
- - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -397,8 +407,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/laguna_xs2.py b/renderers/laguna_xs2.py index bd6b64f..583b7aa 100644 --- a/renderers/laguna_xs2.py +++ b/renderers/laguna_xs2.py @@ -30,12 +30,12 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Content, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, ) @@ -169,8 +169,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -382,8 +399,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. 
+ text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/minimax_m2.py b/renderers/minimax_m2.py index f990274..d690c70 100644 --- a/renderers/minimax_m2.py +++ b/renderers/minimax_m2.py @@ -17,11 +17,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -133,8 +133,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -152,23 +169,22 @@ def emit_token_overlap_body( """Tokenize ``full_text`` and mark tokens that overlap the body char span as ``is_content=True``. 
- Differs from :func:`attribute_text_segments` only in the - boundary-token rule: a token straddling scaffold→body gets - ``True`` if any of its bytes are body bytes (overlap rule), - rather than being attributed to whichever segment its first - char belongs to. The body's first byte is preserved even when - BPE merges it with the wrap's trailing byte (``>The`` → - single token). + Uses an "intersects body span" rule: a token straddling + scaffold→body gets ``True`` if any of its bytes are body + bytes, rather than being attributed to whichever segment its + first char belongs to. The body's first byte is preserved + even when BPE merges it with the wrap's trailing byte + (``>The`` → single token). The other renderers don't need + this because their scaffolds break at characters BPE + doesn't merge across (``\\n``, special tokens); the + ``...`` template here glues scaffold and body + with no separator. """ from renderers.base import _get_offset_tokenizer offset_tok = _get_offset_tokenizer(self._tokenizer) - encoding = offset_tok( - full_text, add_special_tokens=False, return_offsets_mapping=True - ) - for tok_id, (start, end) in zip( - encoding["input_ids"], encoding["offset_mapping"] - ): + encoding = offset_tok.encode(full_text, add_special_tokens=False) + for tok_id, (start, end) in zip(encoding.ids, encoding.offsets): overlaps = start < body_end and end > body_start tokens.append(tok_id) indices.append(msg_idx) @@ -381,8 +397,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. 
+ text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) @@ -627,15 +660,13 @@ def _render_tool( # ```` is plain text with no separator between the # closing ``>`` and ``content``'s first byte, so BPE can merge - # them into a single token (e.g., ``>The``). The shared - # ``attribute_text_segments`` helper picks the segment of a - # boundary-spanning token by its *first* char (here scaffold), - # which would drop the body's leading letter out of the body - # run. We instead use an "intersects body" rule: any token whose - # ``[start, end)`` char range overlaps the body span gets + # them into a single token (e.g., ``>The``). A "first char + # wins" rule would drop the body's leading letter out of the + # body run. We instead use an "intersects body" rule: any token + # whose ``[start, end)`` char range overlaps the body span gets # ``is_content=True``. A few scaffold bytes (the leading ``>`` - # or trailing ``<``) bleed into the body run, but body bytes are - # recoverable as a substring of the decoded body span. + # or trailing ``<``) bleed into the body run, but body bytes + # are recoverable as a substring of the decoded body span. 
body_text = prefix + "" + content + "" + suffix body_start = len(prefix) + len("") body_end = body_start + len(content) diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index c29129c..950009a 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -20,11 +20,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -316,8 +316,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -596,8 +613,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. 
+ text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/qwen3.py b/renderers/qwen3.py index 9253b3d..143e16c 100644 --- a/renderers/qwen3.py +++ b/renderers/qwen3.py @@ -14,11 +14,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -127,15 +127,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. - - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -351,8 +361,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/qwen35.py b/renderers/qwen35.py index cdb8ee1..5d95478 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -20,13 +20,13 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, MultiModalData, ParsedResponse, PlaceholderRange, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -341,15 +341,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. 
- - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -706,8 +716,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 7b82d7e..c4cb5d9 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -36,13 +36,13 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, MultiModalData, ParsedResponse, PlaceholderRange, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, trim_to_turn_close, @@ -223,11 +223,10 @@ def text(self, text: str, *, is_sampled: bool, is_content: bool) -> None: if not text: return # Adjacent text under different msg_idx or is_sampled is rare in - # this template — but flush at those boundaries so attribution - # and the sampled signal stay accurate. is_content boundaries do - # NOT force a flush: they're carried through the joined BPE pass - # via :func:`attribute_text_segments`, preserving merges across - # the wrap/body boundary. + # this template — but flush at those boundaries so the sampled + # signal stays accurate. is_content boundaries do NOT force a + # flush: mixed-is_content runs are encoded jointly and + # attributed per token (see ``_flush``). if self._segments and ( self._buf_idx != self.msg_idx or self._buf_sampled != is_sampled ): @@ -274,13 +273,11 @@ def _flush(self) -> None: self.sampled.extend([self._buf_sampled] * len(ids)) self.is_content.extend([first_ic] * len(ids)) return - # Mixed body/scaffold flush — encode once and attribute back to - # each segment via the fast tokenizer's offset_mapping. Requires - # a tokenizer (not just the encode fn) to look up offsets. - assert self._tokenizer is not None, ( - "_Emitter mixed-is_content flush requires a tokenizer; " - "pass one to the constructor." 
- ) + # Mixed body/scaffold flush — joined encode + offset attribution + # preserves BPE merges across the label-transition boundary + # (e.g., ``"user\n"`` scaffold ↔ caller body, where a trailing + # char of the body could merge with the leading scaffold byte + # of the next segment). for tok_id, is_content in attribute_text_segments(self._tokenizer, segments): self.token_ids.append(tok_id) self.message_indices.append(self._buf_idx) diff --git a/tests/test_load_tokenizer.py b/tests/test_load_tokenizer.py index ea15d6a..72e9d0e 100644 --- a/tests/test_load_tokenizer.py +++ b/tests/test_load_tokenizer.py @@ -123,21 +123,20 @@ def test_tokenizer_source_overrides_are_exact_llama_mirrors(): def test_get_offset_tokenizer_rejects_offsetless_byo(): - """BYO tokenizers without ``return_offsets_mapping`` support raise a + """BYO tokenizers without a ``tokenizers.Tokenizer`` backend raise a clear error. Hand-coded renderers concatenate scaffold + body in one - BPE pass and attribute tokens via the fast tokenizer's offset map; - no transparent reload-from-name_or_path fallback exists. The - contract is: pass a fast tokenizer or get a loud error at construct - time, not silent BPE drift at the wrap/body boundary.""" + BPE pass and attribute tokens via the Rust tokenizer's + ``Encoding.offsets``; no transparent reload-from-name_or_path + fallback exists. The contract is: pass a fast tokenizer with a + Rust backend or get a loud error at construct time, not silent BPE + drift at the wrap/body boundary.""" - class _NoOffsets: + class _NoRustBackend: name_or_path = "anywhere/anything" + # No ``backend_tokenizer`` attribute, not a ``tokenizers.Tokenizer``. 
- def __call__(self, *args, **kwargs): - raise NotImplementedError("BYO tokenizer has no offsets") - - with pytest.raises(RuntimeError, match="fast tokenizer.*offsets"): - base._get_offset_tokenizer(_NoOffsets()) + with pytest.raises(RuntimeError, match=r"fast tokenizer.*tokenizers\.Tokenizer"): + base._get_offset_tokenizer(_NoRustBackend()) # ---------------------------------------------------------------------------