44import warnings
55from dataclasses import dataclass , field
66
7- from transformers import AutoTokenizer , PreTrainedTokenizerBase
7+ from transformers import AutoTokenizer
8+ from transformers .tokenization_utils_base import PreTrainedTokenizerBase
89
910
1011def patch_datasets_warning ():
@@ -32,16 +33,6 @@ def filter_specific_warning(warning):
3233 frame = frame .f_back
3334 return False
3435
35- # Register the custom filter
36- warnings .filterwarnings ("ignore" , category = UserWarning , module = r".*" )
37- warnings .showwarning = (
38- lambda message , category , filename , lineno , file = None , line = None : None
39- if filter_specific_warning (
40- warnings .WarningMessage (message , category , filename , lineno )
41- )
42- else warnings .showwarning (message , category , filename , lineno )
43- )
44-
4536
4637@dataclass
4738class TokenizerMetadata :
@@ -62,7 +53,7 @@ class TokenizerMetadata:
6253 def from_tokenizer (cls , tokenizer : PreTrainedTokenizerBase , max_length : int ):
6354 """Create metadata from a tokenizer instance."""
6455 vocab_size = tokenizer .vocab_size # type: ignore
65- if not vocab_size :
56+ if not vocab_size or not isinstance ( vocab_size , int ) :
6657 raise ValueError ("The tokenizer does not have a vocab size." )
6758 return cls (
6859 vocab_size = vocab_size ,
0 commit comments