From 50d57392e6fb55002834892b5df78e496945ab75 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 04:47:21 +0200 Subject: [PATCH 01/10] gh-147991: Speed up tomllib import time Defer regular expressions import until the first datetime, localtime or non-trivial number (other than just decimal digits) is met. --- Lib/test/test_tomllib/test_misc.py | 15 +++++ Lib/tomllib/_parser.py | 61 ++++++++++++++++--- ...-04-02-05-06-34.gh-issue-147991.2ANtR5.rst | 2 + 3 files changed, 69 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index 118fde24d88521..4ffcb23eb52735 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -124,3 +124,18 @@ def test_types_import(self): never imported by tests. """ importlib.import_module(f"{tomllib.__name__}._types") + + def test_parse_simple_number(self): + parse_simple_number = tomllib._parser._parse_simple_number + self.assertEqual(parse_simple_number("123", 0), (3, 123)) + self.assertEqual(parse_simple_number("123\n", 0), (3, 123)) + self.assertEqual(parse_simple_number("0\n", 0), (1, 0)) + + self.assertIsNone(parse_simple_number("0123\n", 0)) + self.assertIsNone(parse_simple_number("123-456\n", 0)) + self.assertIsNone(parse_simple_number("123:456\n", 0)) + self.assertIsNone(parse_simple_number("1.0\n", 0)) + self.assertIsNone(parse_simple_number("1_000\n", 0)) + self.assertIsNone(parse_simple_number("x123\n", 0)) + self.assertIsNone(parse_simple_number("o123\n", 0)) + self.assertIsNone(parse_simple_number("b100\n", 0)) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index b59d0f7d54bdc3..5e92bdf4a06154 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -6,15 +6,6 @@ from types import MappingProxyType -from ._re import ( - RE_DATETIME, - RE_LOCALTIME, - RE_NUMBER, - match_to_datetime, - 
match_to_localtime, - match_to_number, -) - TYPE_CHECKING = False if TYPE_CHECKING: from collections.abc import Iterable @@ -22,6 +13,19 @@ from ._types import Key, ParseFloat, Pos + _REGEX_IMPORTED = True + from ._re import ( + RE_DATETIME, + RE_LOCALTIME, + RE_NUMBER, + match_to_datetime, + match_to_localtime, + match_to_number, + ) +else: + # Regular expressions are lazy imported to speed up startup time + _REGEX_IMPORTED = False + ASCII_CTRL: Final = frozenset(chr(i) for i in range(32)) | frozenset(chr(127)) # Neither of these sets include quotation mark or backslash. They are @@ -41,6 +45,7 @@ ) KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'") HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789") +_DECDIGIT_CHARS: Final = frozenset("0123456789") BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType( { @@ -665,6 +670,25 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]: pos += 1 +def _parse_simple_number( + src: str, pos: Pos +) -> None | tuple[Pos, int]: + start = pos + src = src.rstrip() + end = len(src) + while src[pos] in _DECDIGIT_CHARS: + pos += 1 + if pos >= end: + break + else: + if src[pos] != "\n": + return None + digits = src[start:pos] + if digits.startswith("0") and len(digits) > 1: + return None + return pos, int(digits) + + def parse_value( src: str, pos: Pos, parse_float: ParseFloat ) -> tuple[Pos, Any]: @@ -703,6 +727,25 @@ def parse_value( if char == "{": return parse_inline_table(src, pos, parse_float) + global _REGEX_IMPORTED, RE_DATETIME, RE_LOCALTIME, RE_NUMBER + global match_to_datetime, match_to_localtime, match_to_number + if not _REGEX_IMPORTED: + # Simple number parser avoiding regex + if char in _DECDIGIT_CHARS: + res = _parse_simple_number(src, pos) + if res is not None: + return res + + from ._re import ( + RE_DATETIME, + RE_LOCALTIME, + RE_NUMBER, + match_to_datetime, + match_to_localtime, + match_to_number, + ) + _REGEX_IMPORTED = True + # Dates and times datetime_match = 
RE_DATETIME.match(src, pos) if datetime_match: diff --git a/Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst b/Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst new file mode 100644 index 00000000000000..581c52926c3565 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst @@ -0,0 +1,2 @@ +Improve :mod:`tomllib` import time (up to 10x faster). Patch by Victor +Stinner. From 118213520c06239316b071917c60c184843ff472 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 12:08:06 +0200 Subject: [PATCH 02/10] Use lazy import --- Lib/test/test_tomllib/test_misc.py | 26 +++++++++++++++++ Lib/tomllib/_parser.py | 47 ++++++++++-------------------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index 4ffcb23eb52735..8868b7994b4f7c 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -9,8 +9,11 @@ from pathlib import Path import sys import tempfile +import textwrap import unittest from test import support +from test.support import os_helper +from test.support.script_helper import assert_python_ok from . 
import tomllib @@ -139,3 +142,26 @@ def test_parse_simple_number(self): self.assertIsNone(parse_simple_number("x123\n", 0)) self.assertIsNone(parse_simple_number("o123\n", 0)) self.assertIsNone(parse_simple_number("b100\n", 0)) + + def test_lazy_import(self): + # Test that _parse_simple_number() can parse the TOML file without + # importing regular expressions (tomllib._re) + filename = os_helper.TESTFN + self.addCleanup(os_helper.unlink, filename) + toml = textwrap.dedent(""" + [metadata] + int = 123 + list = [+1, -2, 3] + table = {x=1, y=2} + """) + with open(filename, "w") as fp: + fp.write(toml) + + code = textwrap.dedent(f""" + import sys, tomllib + with open({filename!a}, "rb") as fp: + tomllib.load(fp) + print("lazy import?", 'tomllib._re' not in sys.modules) + """) + proc = assert_python_ok('-c', code) + self.assertIn(b'lazy import? True', proc.out) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index 5e92bdf4a06154..1ca263619b726b 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -6,6 +6,17 @@ from types import MappingProxyType +__lazy_modules__ = ["tomllib._re"] + +from ._re import ( + RE_DATETIME, + RE_LOCALTIME, + RE_NUMBER, + match_to_datetime, + match_to_localtime, + match_to_number, +) + TYPE_CHECKING = False if TYPE_CHECKING: from collections.abc import Iterable @@ -13,19 +24,6 @@ from ._types import Key, ParseFloat, Pos - _REGEX_IMPORTED = True - from ._re import ( - RE_DATETIME, - RE_LOCALTIME, - RE_NUMBER, - match_to_datetime, - match_to_localtime, - match_to_number, - ) -else: - # Regular expressions are lazy imported to speed up startup time - _REGEX_IMPORTED = False - ASCII_CTRL: Final = frozenset(chr(i) for i in range(32)) | frozenset(chr(127)) # Neither of these sets include quotation mark or backslash. 
They are @@ -727,24 +725,11 @@ def parse_value( if char == "{": return parse_inline_table(src, pos, parse_float) - global _REGEX_IMPORTED, RE_DATETIME, RE_LOCALTIME, RE_NUMBER - global match_to_datetime, match_to_localtime, match_to_number - if not _REGEX_IMPORTED: - # Simple number parser avoiding regex - if char in _DECDIGIT_CHARS: - res = _parse_simple_number(src, pos) - if res is not None: - return res - - from ._re import ( - RE_DATETIME, - RE_LOCALTIME, - RE_NUMBER, - match_to_datetime, - match_to_localtime, - match_to_number, - ) - _REGEX_IMPORTED = True + # Simple number parser avoiding regex + if char in _DECDIGIT_CHARS: + res = _parse_simple_number(src, pos) + if res is not None: + return res # Dates and times datetime_match = RE_DATETIME.match(src, pos) From 4899c2e78c91a350f908891a3adea7622bc1f593 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 12:22:29 +0200 Subject: [PATCH 03/10] Enhance _parse_simple_number() to handle more cases --- Lib/test/test_tomllib/test_misc.py | 21 ++++++++++++++++++--- Lib/tomllib/_parser.py | 29 +++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index 8868b7994b4f7c..00e7e693964495 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -132,11 +132,26 @@ def test_parse_simple_number(self): parse_simple_number = tomllib._parser._parse_simple_number self.assertEqual(parse_simple_number("123", 0), (3, 123)) self.assertEqual(parse_simple_number("123\n", 0), (3, 123)) + self.assertEqual(parse_simple_number("123 456", 0), (3, 123)) + self.assertEqual(parse_simple_number("+123\n", 0), (4, 123)) + self.assertEqual(parse_simple_number("-123\n", 0), (4, -123)) self.assertEqual(parse_simple_number("0\n", 0), (1, 0)) - + self.assertEqual(parse_simple_number("+0\n", 0), (2, 0)) + self.assertEqual(parse_simple_number("-0\n", 0), (2, 0)) + 
self.assertEqual(parse_simple_number("[23]\n", 1), (3, 23)) + self.assertEqual(parse_simple_number("[23, 24]\n", 1), (3, 23)) + self.assertEqual(parse_simple_number("[23]\n", 1), (3, 23)) + self.assertEqual(parse_simple_number("{x = 42}\n", 5), (7, 42)) + + self.assertIsNone(parse_simple_number("+", 0), None) + self.assertIsNone(parse_simple_number("-", 0), None) + self.assertIsNone(parse_simple_number("+\n", 0), None) + self.assertIsNone(parse_simple_number("-\n", 0), None) + self.assertIsNone(parse_simple_number("+inf\n", 0), None) + self.assertIsNone(parse_simple_number("-nan\n", 0), None) self.assertIsNone(parse_simple_number("0123\n", 0)) - self.assertIsNone(parse_simple_number("123-456\n", 0)) - self.assertIsNone(parse_simple_number("123:456\n", 0)) + self.assertIsNone(parse_simple_number("1979-05-27\n", 0)) + self.assertIsNone(parse_simple_number("12:32:00\n", 0)) self.assertIsNone(parse_simple_number("1.0\n", 0)) self.assertIsNone(parse_simple_number("1_000\n", 0)) self.assertIsNone(parse_simple_number("x123\n", 0)) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index 1ca263619b726b..770fe32bf2089b 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -44,6 +44,8 @@ KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'") HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789") _DECDIGIT_CHARS: Final = frozenset("0123456789") +_NUMBER_INITIAL_CHARS: Final = _DECDIGIT_CHARS | frozenset("+-") +_NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType( { @@ -672,19 +674,29 @@ def _parse_simple_number( src: str, pos: Pos ) -> None | tuple[Pos, int]: start = pos - src = src.rstrip() end = len(src) + end_chars = _NUMBER_END_CHARS + if src[pos] in '+-': + pos += 1 + if pos >= end: + return None + if src[pos] not in _DECDIGIT_CHARS: + return None + + if src[pos] == '0': + pos += 1 + if pos < end and src[pos] not in end_chars: + return None + return pos, 0 + 
while src[pos] in _DECDIGIT_CHARS: pos += 1 if pos >= end: break else: - if src[pos] != "\n": + if src[pos] not in end_chars: return None - digits = src[start:pos] - if digits.startswith("0") and len(digits) > 1: - return None - return pos, int(digits) + return pos, int(src[start:pos]) def parse_value( @@ -725,8 +737,9 @@ def parse_value( if char == "{": return parse_inline_table(src, pos, parse_float) - # Simple number parser avoiding regex - if char in _DECDIGIT_CHARS: + # First try a simple number parser which defers import tomllib._re + # to speed up tomllib import time + if char in _NUMBER_INITIAL_CHARS: res = _parse_simple_number(src, pos) if res is not None: return res From 4d11aa98995e6dad19b0f2aeaa34a9073a681f24 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 12:40:22 +0200 Subject: [PATCH 04/10] Remove duplicated test --- Lib/test/test_tomllib/test_misc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index 00e7e693964495..c5b0e48b44adae 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -140,7 +140,6 @@ def test_parse_simple_number(self): self.assertEqual(parse_simple_number("-0\n", 0), (2, 0)) self.assertEqual(parse_simple_number("[23]\n", 1), (3, 23)) self.assertEqual(parse_simple_number("[23, 24]\n", 1), (3, 23)) - self.assertEqual(parse_simple_number("[23]\n", 1), (3, 23)) self.assertEqual(parse_simple_number("{x = 42}\n", 5), (7, 42)) self.assertIsNone(parse_simple_number("+", 0), None) From ce57c76f0aa500530564c24b1a445f13af36acef Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 12:44:11 +0200 Subject: [PATCH 05/10] Replace types.MappingProxyType with frozendict --- Lib/tomllib/_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index 770fe32bf2089b..363e4d4927e7de 100644 --- a/Lib/tomllib/_parser.py +++ 
b/Lib/tomllib/_parser.py @@ -4,8 +4,6 @@ from __future__ import annotations -from types import MappingProxyType - __lazy_modules__ = ["tomllib._re"] from ._re import ( @@ -47,7 +45,7 @@ _NUMBER_INITIAL_CHARS: Final = _DECDIGIT_CHARS | frozenset("+-") _NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE -BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType( +BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict( # type: ignore[name-defined] { "\\b": "\u0008", # backspace "\\t": "\u0009", # tab From 63fd2b185a3a2fe165168594e28d0f537c5057b0 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 17:11:45 +0200 Subject: [PATCH 06/10] Use public names --- Lib/test/test_tomllib/test_misc.py | 4 ++-- Lib/tomllib/_parser.py | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index c5b0e48b44adae..a23c713cbb6448 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -129,7 +129,7 @@ def test_types_import(self): importlib.import_module(f"{tomllib.__name__}._types") def test_parse_simple_number(self): - parse_simple_number = tomllib._parser._parse_simple_number + parse_simple_number = tomllib._parser.parse_simple_number self.assertEqual(parse_simple_number("123", 0), (3, 123)) self.assertEqual(parse_simple_number("123\n", 0), (3, 123)) self.assertEqual(parse_simple_number("123 456", 0), (3, 123)) @@ -158,7 +158,7 @@ def test_parse_simple_number(self): self.assertIsNone(parse_simple_number("b100\n", 0)) def test_lazy_import(self): - # Test that _parse_simple_number() can parse the TOML file without + # Test that parse_simple_number() can parse the TOML file without # importing regular expressions (tomllib._re) filename = os_helper.TESTFN self.addCleanup(os_helper.unlink, filename) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index 363e4d4927e7de..13085cb19bbe97 100644 --- 
a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -41,9 +41,9 @@ ) KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'") HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789") -_DECDIGIT_CHARS: Final = frozenset("0123456789") -_NUMBER_INITIAL_CHARS: Final = _DECDIGIT_CHARS | frozenset("+-") -_NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE +DECDIGIT_CHARS: Final = frozenset("0123456789") +NUMBER_INITIAL_CHARS: Final = DECDIGIT_CHARS | frozenset("+-") +NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict( # type: ignore[name-defined] { @@ -668,17 +668,18 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]: pos += 1 -def _parse_simple_number( +# Sub-set of RE_NUMBER: only support decimal integer without "_" separator +def parse_simple_number( src: str, pos: Pos ) -> None | tuple[Pos, int]: start = pos end = len(src) - end_chars = _NUMBER_END_CHARS + end_chars = NUMBER_END_CHARS if src[pos] in '+-': pos += 1 if pos >= end: return None - if src[pos] not in _DECDIGIT_CHARS: + if src[pos] not in DECDIGIT_CHARS: return None if src[pos] == '0': @@ -687,7 +688,7 @@ def _parse_simple_number( return None return pos, 0 - while src[pos] in _DECDIGIT_CHARS: + while src[pos] in DECDIGIT_CHARS: pos += 1 if pos >= end: break @@ -737,8 +738,8 @@ def parse_value( # First try a simple number parser which defers import tomllib._re # to speed up tomllib import time - if char in _NUMBER_INITIAL_CHARS: - res = _parse_simple_number(src, pos) + if char in NUMBER_INITIAL_CHARS: + res = parse_simple_number(src, pos) if res is not None: return res From ed127fabc6cdcdf475a3ec3f84d2abdb04d9f8ee Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 2 Apr 2026 17:13:02 +0200 Subject: [PATCH 07/10] Fix tests on hex/oct/bin numbers --- Lib/test/test_tomllib/test_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index a23c713cbb6448..304203986a7821 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -153,9 +153,9 @@ def test_parse_simple_number(self): self.assertIsNone(parse_simple_number("12:32:00\n", 0)) self.assertIsNone(parse_simple_number("1.0\n", 0)) self.assertIsNone(parse_simple_number("1_000\n", 0)) - self.assertIsNone(parse_simple_number("x123\n", 0)) - self.assertIsNone(parse_simple_number("o123\n", 0)) - self.assertIsNone(parse_simple_number("b100\n", 0)) + self.assertIsNone(parse_simple_number("0x123\n", 0)) + self.assertIsNone(parse_simple_number("0o123\n", 0)) + self.assertIsNone(parse_simple_number("0b100\n", 0)) def test_lazy_import(self): # Test that parse_simple_number() can parse the TOML file without From 9be2a80009fff08de312b6ec8b24b2df77ee0188 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 3 Apr 2026 00:11:48 +0200 Subject: [PATCH 08/10] Rename to try_simple_decimal() --- Lib/test/test_tomllib/test_misc.py | 58 +++++++++++++++--------------- Lib/tomllib/_parser.py | 4 +-- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py index 304203986a7821..bd257302255992 100644 --- a/Lib/test/test_tomllib/test_misc.py +++ b/Lib/test/test_tomllib/test_misc.py @@ -128,37 +128,37 @@ def test_types_import(self): """ importlib.import_module(f"{tomllib.__name__}._types") - def test_parse_simple_number(self): - parse_simple_number = tomllib._parser.parse_simple_number - self.assertEqual(parse_simple_number("123", 0), (3, 123)) - self.assertEqual(parse_simple_number("123\n", 0), (3, 123)) - self.assertEqual(parse_simple_number("123 456", 0), (3, 123)) - self.assertEqual(parse_simple_number("+123\n", 0), (4, 123)) - self.assertEqual(parse_simple_number("-123\n", 0), (4, -123)) - self.assertEqual(parse_simple_number("0\n", 0), (1, 0)) - 
self.assertEqual(parse_simple_number("+0\n", 0), (2, 0)) - self.assertEqual(parse_simple_number("-0\n", 0), (2, 0)) - self.assertEqual(parse_simple_number("[23]\n", 1), (3, 23)) - self.assertEqual(parse_simple_number("[23, 24]\n", 1), (3, 23)) - self.assertEqual(parse_simple_number("{x = 42}\n", 5), (7, 42)) - - self.assertIsNone(parse_simple_number("+", 0), None) - self.assertIsNone(parse_simple_number("-", 0), None) - self.assertIsNone(parse_simple_number("+\n", 0), None) - self.assertIsNone(parse_simple_number("-\n", 0), None) - self.assertIsNone(parse_simple_number("+inf\n", 0), None) - self.assertIsNone(parse_simple_number("-nan\n", 0), None) - self.assertIsNone(parse_simple_number("0123\n", 0)) - self.assertIsNone(parse_simple_number("1979-05-27\n", 0)) - self.assertIsNone(parse_simple_number("12:32:00\n", 0)) - self.assertIsNone(parse_simple_number("1.0\n", 0)) - self.assertIsNone(parse_simple_number("1_000\n", 0)) - self.assertIsNone(parse_simple_number("0x123\n", 0)) - self.assertIsNone(parse_simple_number("0o123\n", 0)) - self.assertIsNone(parse_simple_number("0b100\n", 0)) + def test_try_simple_decimal(self): + try_simple_decimal = tomllib._parser.try_simple_decimal + self.assertEqual(try_simple_decimal("123", 0), (3, 123)) + self.assertEqual(try_simple_decimal("123\n", 0), (3, 123)) + self.assertEqual(try_simple_decimal("123 456", 0), (3, 123)) + self.assertEqual(try_simple_decimal("+123\n", 0), (4, 123)) + self.assertEqual(try_simple_decimal("-123\n", 0), (4, -123)) + self.assertEqual(try_simple_decimal("0\n", 0), (1, 0)) + self.assertEqual(try_simple_decimal("+0\n", 0), (2, 0)) + self.assertEqual(try_simple_decimal("-0\n", 0), (2, 0)) + self.assertEqual(try_simple_decimal("[23]\n", 1), (3, 23)) + self.assertEqual(try_simple_decimal("[23, 24]\n", 1), (3, 23)) + self.assertEqual(try_simple_decimal("{x = 42}\n", 5), (7, 42)) + + self.assertIsNone(try_simple_decimal("+", 0), None) + self.assertIsNone(try_simple_decimal("-", 0), None) + 
self.assertIsNone(try_simple_decimal("+\n", 0), None) + self.assertIsNone(try_simple_decimal("-\n", 0), None) + self.assertIsNone(try_simple_decimal("+inf\n", 0), None) + self.assertIsNone(try_simple_decimal("-nan\n", 0), None) + self.assertIsNone(try_simple_decimal("0123\n", 0)) + self.assertIsNone(try_simple_decimal("1979-05-27\n", 0)) + self.assertIsNone(try_simple_decimal("12:32:00\n", 0)) + self.assertIsNone(try_simple_decimal("1.0\n", 0)) + self.assertIsNone(try_simple_decimal("1_000\n", 0)) + self.assertIsNone(try_simple_decimal("0x123\n", 0)) + self.assertIsNone(try_simple_decimal("0o123\n", 0)) + self.assertIsNone(try_simple_decimal("0b100\n", 0)) def test_lazy_import(self): - # Test that parse_simple_number() can parse the TOML file without + # Test that try_simple_decimal() can parse the TOML file without # importing regular expressions (tomllib._re) filename = os_helper.TESTFN self.addCleanup(os_helper.unlink, filename) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index 13085cb19bbe97..a5c7fe191547b6 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -669,7 +669,7 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]: # Sub-set of RE_NUMBER: only support decimal integer without "_" separator -def parse_simple_number( +def try_simple_decimal( src: str, pos: Pos ) -> None | tuple[Pos, int]: start = pos @@ -739,7 +739,7 @@ def parse_value( # First try a simple number parser which defers import tomllib._re # to speed up tomllib import time if char in NUMBER_INITIAL_CHARS: - res = parse_simple_number(src, pos) + res = try_simple_decimal(src, pos) if res is not None: return res From a207fbb2462b7d4bd6336fe53edca3a23e23f75e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 3 Apr 2026 00:18:05 +0200 Subject: [PATCH 09/10] Use Taneli's implementation of try_simple_decimal() Co-authored-by: Taneli Hukkinen --- Lib/tomllib/_parser.py | 70 +++++++++++++++++++++++++----------------- 1 file changed, 
41 insertions(+), 29 deletions(-) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index a5c7fe191547b6..de5e36434a0ba4 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -4,6 +4,9 @@ from __future__ import annotations +# Defer loading regular expressions until we actually need them in +# parse_value(). Before that, use try_simple_decimal() to parse simple +# decimal numbers. __lazy_modules__ = ["tomllib._re"] from ._re import ( @@ -41,9 +44,17 @@ ) KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'") HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789") -DECDIGIT_CHARS: Final = frozenset("0123456789") -NUMBER_INITIAL_CHARS: Final = DECDIGIT_CHARS | frozenset("+-") -NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE + +# If one of these follows a "simple decimal" it could mean that +# the value is actually something else (float, datetime...), so +# optimized parsing should be abandoned. +ILLEGAL_AFTER_SIMPLE_DECIMAL: Final = frozenset( + "eE." # decimal + "xbo" # hex, bin, oct + "-" # datetime + ":" # localtime + "_0123456789" # complex decimal +) BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict( # type: ignore[name-defined] { @@ -668,34 +679,35 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]: pos += 1 -# Sub-set of RE_NUMBER: only support decimal integer without "_" separator def try_simple_decimal( src: str, pos: Pos ) -> None | tuple[Pos, int]: - start = pos - end = len(src) - end_chars = NUMBER_END_CHARS - if src[pos] in '+-': - pos += 1 - if pos >= end: - return None - if src[pos] not in DECDIGIT_CHARS: - return None + """Parse a "simple" decimal integer. + + An optimization that tries to parse a simple decimal integer + without underscores. Returns `None` if there's any uncertainty + on correctness. 
+ """ + start_pos = pos - if src[pos] == '0': + if src.startswith(("+", "-"), pos): pos += 1 - if pos < end and src[pos] not in end_chars: - return None - return pos, 0 - while src[pos] in DECDIGIT_CHARS: + if src.startswith("0", pos): pos += 1 - if pos >= end: - break + elif src.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9"), pos): + pos = skip_chars(src, pos, "0123456789") else: - if src[pos] not in end_chars: - return None - return pos, int(src[start:pos]) + return None + + try: + next_char = src[pos] + except IndexError: + next_char = None + if next_char in ILLEGAL_AFTER_SIMPLE_DECIMAL: + return None + + return pos, int(src[start_pos:pos]) def parse_value( @@ -736,12 +748,12 @@ def parse_value( if char == "{": return parse_inline_table(src, pos, parse_float) - # First try a simple number parser which defers import tomllib._re - # to speed up tomllib import time - if char in NUMBER_INITIAL_CHARS: - res = try_simple_decimal(src, pos) - if res is not None: - return res + # Try a simple parser for decimal numbers. If it's able to parse all + # numbers, it avoids importing tomllib._re which has an impact on + # the tomllib startup time. + res = try_simple_decimal(src, pos) + if res is not None: + return res # Dates and times datetime_match = RE_DATETIME.match(src, pos) From 82d4e2b3d3edf59a256b0840d3e49247495cae1f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 3 Apr 2026 00:22:47 +0200 Subject: [PATCH 10/10] Rename res to number --- Lib/tomllib/_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py index de5e36434a0ba4..ede6859a64396b 100644 --- a/Lib/tomllib/_parser.py +++ b/Lib/tomllib/_parser.py @@ -751,9 +751,9 @@ def parse_value( # Try a simple parser for decimal numbers. If it's able to parse all # numbers, it avoids importing tomllib._re which has an impact on # the tomllib startup time. 
- res = try_simple_decimal(src, pos) - if res is not None: - return res + number = try_simple_decimal(src, pos) + if number is not None: + return number # Dates and times datetime_match = RE_DATETIME.match(src, pos)