|
| 1 | +import logging |
1 | 2 | from tempfile import TemporaryDirectory |
2 | 3 |
|
3 | 4 | import numpy as np |
4 | 5 | import pytest |
5 | 6 | import torch |
| 7 | +from skeletoken import TokenizerModel |
6 | 8 | from tokenizers import Tokenizer |
7 | 9 | from transformers import AutoTokenizer |
8 | 10 |
|
9 | 11 | from model2vec.model import StaticModel |
10 | 12 | from model2vec.train import StaticModelForClassification |
11 | 13 | from model2vec.train.base import FinetunableStaticModel, TextDataset |
| 14 | +from model2vec.train.utils import get_probable_pad_token_id |
12 | 15 |
|
13 | 16 |
|
14 | 17 | @pytest.mark.parametrize("n_layers", [0, 1, 2, 3]) |
@@ -67,6 +70,21 @@ def test_init_classifier_from_model(mock_vectors: np.ndarray, mock_tokenizer: To |
67 | 70 | assert s.w.shape[0] == mock_vectors.shape[0] |
68 | 71 |
|
69 | 72 |
|
| 73 | +def test_pad_token(mock_tokenizer: Tokenizer) -> None: |
| 74 | + """Test initializion from a static model.""" |
| 75 | + tokenizer_model = TokenizerModel.from_tokenizer(mock_tokenizer) |
| 76 | + tokenizer_model.pad_token = "[HELLO]" |
| 77 | + tokenizer = tokenizer_model.to_tokenizer() |
| 78 | + vectors = np.random.RandomState().randn(6, 10) |
| 79 | + model = StaticModel(vectors=vectors, tokenizer=tokenizer) |
| 80 | + s = StaticModelForClassification.from_static_model(model=model, pad_token="[HELLO]") |
| 81 | + assert s.w.shape[0] == vectors.shape[0] |
| 82 | + assert s.pad_id == 5 |
| 83 | + |
| 84 | + with pytest.raises(KeyError): |
| 85 | + StaticModelForClassification.from_static_model(model=model, pad_token="[BRR]") |
| 86 | + |
| 87 | + |
70 | 88 | def test_encode(mock_trained_pipeline: StaticModelForClassification) -> None: |
71 | 89 | """Test the encode function.""" |
72 | 90 | result = mock_trained_pipeline._encode(torch.tensor([[0, 1], [1, 0]]).long()) |
@@ -231,3 +249,35 @@ def test_evaluate(mock_trained_pipeline: StaticModelForClassification) -> None: |
231 | 249 | else: |
232 | 250 | # Ignore the type error since we don't support int labels in our typing, but the code does |
233 | 251 | mock_trained_pipeline.evaluate(["dog cat", "dog"], [1, 1]) # type: ignore |
| 252 | + |
| 253 | + |
| 254 | +def test_get_probable_pad_token_id(mock_tokenizer: Tokenizer, caplog: pytest.LogCaptureFixture) -> None: |
| 255 | + """Test loading from a static model with a pad token.""" |
| 256 | + tokenizer_model = TokenizerModel.from_tokenizer(mock_tokenizer) |
| 257 | + t = tokenizer_model.to_tokenizer() |
| 258 | + token_id = get_probable_pad_token_id(t) |
| 259 | + assert token_id == 0 |
| 260 | + |
| 261 | + # Setting an unknown pad token adds it to the vocabulary as a new token. |
| 262 | + tokenizer_model.pad_token = "haha" |
| 263 | + t = tokenizer_model.to_tokenizer() |
| 264 | + token_id = get_probable_pad_token_id(t) |
| 265 | + assert token_id == 5 |
| 266 | + |
| 267 | + tokenizer_model.pad_token = "word1" |
| 268 | + t = tokenizer_model.to_tokenizer() |
| 269 | + token_id = get_probable_pad_token_id(t) |
| 270 | + assert token_id == 1 |
| 271 | + |
| 272 | + # Unsetting the pad token falls back to the [PAD] token in the vocabulary. |
| 273 | + tokenizer_model.pad_token = None |
| 274 | + t = tokenizer_model.to_tokenizer() |
| 275 | + token_id = get_probable_pad_token_id(t) |
| 276 | + assert token_id == tokenizer_model.vocabulary["[PAD]"] |
| 277 | + |
| 278 | + tokenizer_model = tokenizer_model.remove_token_from_vocabulary("[PAD]") |
| 279 | + t = tokenizer_model.to_tokenizer() |
| 280 | + with caplog.at_level(logging.WARNING, logger="model2vec.train.utils"): |
| 281 | + token_id = get_probable_pad_token_id(t) |
| 282 | + assert token_id == 0 |
| 283 | + assert "No known pad token found, using 0 as default" in caplog.text |