Commits (17)
8e2fd79
chore(tokenizer wrapper): Adapted a wrapper for sp tokenizer.
ajude2s Dec 16, 2024
e5c3b28
fix: checkpoint conversion to HF
flxst Dec 16, 2024
17585fd
chore(tokenizer wrapper): SP Tokenizer wrapper for Modalities tokenizer.
ajude2s Dec 18, 2024
b115bce
chore(tokenizer wrapper): SP Tokenizer wrapper for Modalities tokenizer.
ajude2s Dec 19, 2024
d26858b
Merge branch 'fix/checkpoint_conversion_to_hf' into eval_modalities2
ajude2s Dec 19, 2024
f7fb4be
chore(tokenizer wrapper): Testing
ajude2s Jan 13, 2025
78ee8df
Merge remote-tracking branch 'origin/main' into eval_modalities2
BlueCrescent Feb 6, 2025
4c046b1
Merge remote-tracking branch 'origin/main' into eval_modalities2
BlueCrescent Feb 12, 2025
123d3d2
chore: Merge remote-tracking branch 'origin/main' into eval_modalities2
BlueCrescent Feb 21, 2025
02bac94
fix(huggingface): Fixed bug in hf adapter config (probably caused by …
BlueCrescent Feb 21, 2025
f63ac51
test(huggingface): Required update of test config for tests to pass.
BlueCrescent Feb 25, 2025
99c5788
test(huggingface): Added additional tests for checkpoint conversion.
BlueCrescent Feb 26, 2025
85139fd
Added type hints for the tests.
ajude2s Feb 26, 2025
dcb1de8
Merge remote-tracking branch 'origin/eval_modalities2' into eval_moda…
ajude2s Feb 26, 2025
c215304
Removed mismatch cases from the test.
ajude2s Feb 26, 2025
e971024
Added the copyright notice of the llama2 implementation of the tokeni…
ajude2s Feb 27, 2025
07285d2
Merge branch 'main' into eval_modalities2
BlueCrescent Mar 5, 2025
247 changes: 243 additions & 4 deletions src/modalities/models/huggingface_adapters/hf_adapter.py
@@ -1,22 +1,27 @@
 import json
 from dataclasses import dataclass
 from pathlib import PosixPath
-from typing import Any, Optional
+from typing import Any, Dict, List, Optional
 
 import torch
-from transformers import PretrainedConfig, PreTrainedModel
+from transformers import AddedToken, PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
+from transformers.tokenization_utils_base import TextInput
 from transformers.utils import ModelOutput
+from transformers.utils import logging
 
 from modalities.models.model import NNModel
-from modalities.models.utils import ModelTypeEnum, get_model_from_config
+from modalities.models.utils import ModelTypeEnum, get_model_from_config, get_tokenizer_from_config
 
+# transformers' logger provides `warning_once`, which a plain stdlib logger does not
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+SPIECE_UNDERLINE = "▁"
+
 class HFModelAdapterConfig(PretrainedConfig):
     """HFModelAdapterConfig configuration class for the HFModelAdapter."""
 
     model_type = "modalities"
 
-    def __init__(self, **kwargs):
+    def __init__(self, config={}, **kwargs):
"""
Initializes an HFModelAdapterConfig object.

Expand All @@ -29,6 +34,7 @@ def __init__(self, **kwargs):
if "config" not in kwargs:
raise ValueError("Config is not passed in HFModelAdapterConfig.")
super().__init__(**kwargs)
self.config = config
# self.config is added by the super class via kwargs
assert self.config is not None, "Config is not passed in HFModelAdapterConfig."
# since the config will be saved to json and json can't handle posixpaths, we need to convert them to strings
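For orientation, a brief sketch of the load path this config enables, mirroring the `from_pretrained` call that `HFTokenizerAdapter` makes further down; `"checkpoint_dir"` is a hypothetical placeholder, not a path from this PR:

```python
from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapterConfig

# "checkpoint_dir" is hypothetical; it should contain a config.json whose
# "config" entry holds the full modalities config dict.
adapter_config = HFModelAdapterConfig.from_pretrained("checkpoint_dir")
assert adapter_config.model_type == "modalities"
```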
@@ -158,3 +164,236 @@ class ModalitiesModelOutput(ModelOutput):
     logits: Optional[torch.FloatTensor] = None
     hidden_states: Optional[tuple[torch.FloatTensor]] = None
     attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class HFTokenizerAdapter(PreTrainedTokenizer):
+    """
+    Hugging Face tokenizer adapter that wraps the modalities SentencePiece tokenizer. Based on the Llama
+    byte-level Byte-Pair-Encoding tokenizer implementation. The default padding token is unset as there is
+    no padding token in the original model.
+
+    Args:
+        config (`HFModelAdapterConfig`):
+            Adapter config holding the modalities config from which the wrapped tokenizer is built.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
+            token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored
+            by attention mechanisms or loss computation.
+        sp_model_kwargs (`Dict[str, Any]`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other
+            things, to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite, samples from all hypotheses (lattice)
+                using the forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Llama should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
+        legacy (`bool`, *optional*):
+            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of
+            #24622 and #25224, which include fixes to properly handle tokens that appear after special tokens.
+            Make sure to also set `from_slow` to `True`.
+            A simple example:
+
+            - `legacy=True`:
+            ```python
+            >>> from transformers import LlamaTokenizerFast
+
+            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
+            >>> tokenizer.encode("Hello <s>.")  # 869 is '▁.'
+            [1, 15043, 29871, 1, 869]
+            ```
+            - `legacy=False`:
+            ```python
+            >>> from transformers import LlamaTokenizerFast
+
+            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
+            >>> tokenizer.encode("Hello <s>.")  # 29889 is '.'
+            [1, 15043, 29871, 1, 29889]
+            ```
+            Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an initial space to the input. This allows treating the leading word just like any
+            other word. Again, this should be set with `from_slow=True` to make sure it's taken into account.
+    """

+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        config: HFModelAdapterConfig,
+        # vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        spaces_between_special_tokens=False,
+        legacy=None,
+        add_prefix_space=True,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
+
+        if legacy is None:
+            logger.warning_once(
+                f"You are using the default legacy behaviour of the {self.__class__}. This is"
+                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes"
+                " for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you"
+                " understand what it means, and thoroughly read the reason why this was added as explained in"
+                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a"
+                " GGUF file you can ignore this message"
+            )
+            legacy = True
+
+        self.legacy = legacy
+        # self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        self.sp_model = get_tokenizer_from_config(config.config, "tokenizer")
+        self.add_prefix_space = add_prefix_space
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            use_default_system_prompt=use_default_system_prompt,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            legacy=legacy,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )

+    @property
+    def unk_token_length(self):
+        return len(self.sp_model.tokenizer.encode(str(self.unk_token)))
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.vocab_size
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        # Load the configuration
+        config = HFModelAdapterConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # Create a new tokenizer instance
+        tokenizer = cls(config=config, legacy=True, **kwargs)
+
+        return tokenizer
+
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
+        """
+        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless
+        the first token is special.
+        """
+        if self.legacy or len(text) == 0:
+            return super().tokenize(text, **kwargs)
+
+        text = text.replace(SPIECE_UNDERLINE, " ")
+        if self.add_prefix_space:
+            text = SPIECE_UNDERLINE + text
+
+        tokens = super().tokenize(text, **kwargs)
+
+        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
+            tokens = tokens[1:]
+        return tokens

+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+    def _tokenize(self, text, **kwargs):
+        """
+        Returns a tokenized string.
+
+        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
+        SPIECE_UNDERLINE. For example: `self.sp_model.tokenizer.encode(f"{SPIECE_UNDERLINE}Hey", out_type=str)`
+        will give `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and
+        strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`:
+        `self.sp_model.tokenizer.encode("<unk> Hey", out_type=str)[4:]`.
+        """
+        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
+            return self.sp_model.tokenizer.encode(text, out_type=str)
+
+        # 1. Encode string + prefix, e.g. "<unk> Hey"
+        tokens = self.sp_model.tokenizer.encode(self.unk_token + text, out_type=str)
+        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
+        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

Member (Author):
From Klaudia: The tokenization logic manually adds `<unk>` before encoding and removes it afterward, assuming consistent tokenization. If tokenization behavior changes (e.g., due to different training conditions), this approach may break.

Collaborator:
This is required since the llama2 tokenizer has removed the `add_dummy_prefix` option, which automatically added a leading whitespace to every input, even when unnecessary, which could interfere with tokenization. The new method introduced in Llama2 of using `<unk>` basically allows us to control when spaces should be preserved: a leading space is preserved only if it is originally present in the input.

> if tokenization behavior changes

Did you mean: if the tokenizer is trained to interpret `<unk>` differently? The function `unk_token_length` dynamically calculates how many tokens represent the `<unk>` token when it is encoded. If the `<unk>` token is treated as a single token (e.g., `['<unk>']`), the length will be 1. If it is split into multiple tokens (e.g., `['<', 'unk', '>']`), the length will be 3.
By using this dynamically calculated length, the tokenizer ensures that it removes the correct number of tokens, no matter how the `<unk>` token is treated internally by the tokenizer.

tokens = ['<unk>', '▁Hey']
unk_token_length = 1
# After removing the `<unk>` token (with length 1)
tokens = tokens[1:]  # Result: ['▁Hey']

tokens = ['<', 'unk', '>', '▁Hey']
unk_token_length = 3
# After removing the `<unk>` token (with length 3)
tokens = tokens[3:]  # Result: ['▁Hey']
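
To make the slicing trick fully concrete, a minimal, self-contained sketch of the same logic; `strip_sentinel` and `toy_encode` are illustrative names, not part of this PR:

```python
from typing import Callable, List

def strip_sentinel(encode: Callable[[str], List[str]], text: str, sentinel: str = "<unk>") -> List[str]:
    # Encode "sentinel + text", then drop exactly as many pieces as the sentinel
    # alone produces, mirroring HFTokenizerAdapter._tokenize and unk_token_length.
    sentinel_len = len(encode(sentinel))
    pieces = encode(sentinel + text)
    return pieces[sentinel_len:] if len(pieces) >= sentinel_len else pieces

# Toy encoder standing in for SentencePiece: splits '<unk>' into 3 pieces.
def toy_encode(s: str) -> List[str]:
    out: List[str] = []
    while s:
        if s.startswith("<unk>"):
            out += ["<", "unk", ">"]
            s = s[5:]
        else:
            out.append(s[0])
            s = s[1:]
    return out

print(strip_sentinel(toy_encode, "▁Hey"))  # ['▁', 'H', 'e', 'y']
```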


+    def _convert_token_to_id(self, token):
+        """Converts a token (str) into an id using the vocab."""
+        return self.sp_model.tokenizer.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) into a token (str) using the vocab."""
+        token = self.sp_model.tokenizer.IdToPiece(index)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings) into a single string."""
+        # since we manually add the prefix space, we have to remove it when decoding
+        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
+            tokens[0] = tokens[0][1:]
+
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using the sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0 and self.legacy:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE):
+                    out_string += " "
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
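
For completeness, a hedged sketch of how the adapter might be exercised end to end; `"checkpoint_dir"` is a placeholder for a converted checkpoint directory, and the exact token outputs depend on the wrapped tokenizer model:

```python
from modalities.models.huggingface_adapters.hf_adapter import HFTokenizerAdapter

# "checkpoint_dir" is hypothetical; from_pretrained() loads the HFModelAdapterConfig
# stored there and builds the wrapped modalities tokenizer from it.
tokenizer = HFTokenizerAdapter.from_pretrained("checkpoint_dir")

ids = tokenizer("Hello world")["input_ids"]  # standard PreTrainedTokenizer __call__ path
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens, tokenizer.convert_tokens_to_string(tokens))
```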
15 changes: 14 additions & 1 deletion src/modalities/models/utils.py
@@ -3,7 +3,7 @@
 from pydantic import BaseModel
 
 from modalities.config.component_factory import ComponentFactory
-from modalities.config.pydanctic_if_types import PydanticPytorchModuleType
+from modalities.config.pydanctic_if_types import PydanticPytorchModuleType, PydanticTokenizerIFType
 from modalities.registry.components import COMPONENTS
 from modalities.registry.registry import Registry
 
@@ -54,3 +54,16 @@ class PydanticConfig(BaseModel):

     components = component_factory.build_components(config_dict=config, components_model_type=PydanticConfig)
     return getattr(components, model_type.value)
+
+
+def get_tokenizer_from_config(config: dict, tokenizer_type: str):
+    registry = Registry(COMPONENTS)
+    component_factory = ComponentFactory(registry=registry)
+
+    class PydanticConfig(BaseModel):
+        tokenizer: PydanticTokenizerIFType
+
+    components = component_factory.build_components(config_dict=config, components_model_type=PydanticConfig)
+    return getattr(components, tokenizer_type)
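
A sketch of how this helper might be driven from a config dict; the `component_key`/`variant_key` values below are illustrative placeholders, since the real keys come from the registered tokenizer components in `COMPONENTS`:

```python
from modalities.models.utils import get_tokenizer_from_config

# Illustrative config dict; the actual schema of the "tokenizer" entry is defined
# by the tokenizer components registered in the modalities component registry.
config = {
    "tokenizer": {
        "component_key": "tokenizer",  # hypothetical key
        "variant_key": "pretrained_sp_tokenizer",  # hypothetical variant
        "config": {"tokenizer_model_file": "tokenizer.model"},
    }
}
tokenizer = get_tokenizer_from_config(config, "tokenizer")
```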