diff --git a/Lib/test/test_tomllib/test_misc.py b/Lib/test/test_tomllib/test_misc.py
index 118fde24d88521..bd257302255992 100644
--- a/Lib/test/test_tomllib/test_misc.py
+++ b/Lib/test/test_tomllib/test_misc.py
@@ -9,8 +9,11 @@
 from pathlib import Path
 import sys
 import tempfile
+import textwrap
 import unittest
 from test import support
+from test.support import os_helper
+from test.support.script_helper import assert_python_ok
 
 from . import tomllib
 
@@ -124,3 +127,59 @@ def test_types_import(self):
         never imported by tests.
         """
         importlib.import_module(f"{tomllib.__name__}._types")
+
+    def test_try_simple_decimal(self):
+        # try_simple_decimal(src, pos) returns (end_pos, value) for a plain
+        # decimal integer at src[pos:], or None to defer to the regex parser.
+        try_simple_decimal = tomllib._parser.try_simple_decimal
+        # Accepted: optionally signed integers without underscores.
+        self.assertEqual(try_simple_decimal("123", 0), (3, 123))
+        self.assertEqual(try_simple_decimal("123\n", 0), (3, 123))
+        self.assertEqual(try_simple_decimal("123 456", 0), (3, 123))
+        self.assertEqual(try_simple_decimal("+123\n", 0), (4, 123))
+        self.assertEqual(try_simple_decimal("-123\n", 0), (4, -123))
+        self.assertEqual(try_simple_decimal("0\n", 0), (1, 0))
+        self.assertEqual(try_simple_decimal("+0\n", 0), (2, 0))
+        self.assertEqual(try_simple_decimal("-0\n", 0), (2, 0))
+        self.assertEqual(try_simple_decimal("[23]\n", 1), (3, 23))
+        self.assertEqual(try_simple_decimal("[23, 24]\n", 1), (3, 23))
+        self.assertEqual(try_simple_decimal("{x = 42}\n", 5), (7, 42))
+
+        # Rejected: bare signs and anything that may be another TOML type.
+        self.assertIsNone(try_simple_decimal("+", 0))
+        self.assertIsNone(try_simple_decimal("-", 0))
+        self.assertIsNone(try_simple_decimal("+\n", 0))
+        self.assertIsNone(try_simple_decimal("-\n", 0))
+        self.assertIsNone(try_simple_decimal("+inf\n", 0))
+        self.assertIsNone(try_simple_decimal("-nan\n", 0))
+        self.assertIsNone(try_simple_decimal("0123\n", 0))
+        self.assertIsNone(try_simple_decimal("1979-05-27\n", 0))
+        self.assertIsNone(try_simple_decimal("12:32:00\n", 0))
+        self.assertIsNone(try_simple_decimal("1.0\n", 0))
+        self.assertIsNone(try_simple_decimal("1_000\n", 0))
+        self.assertIsNone(try_simple_decimal("0x123\n", 0))
+        self.assertIsNone(try_simple_decimal("0o123\n", 0))
+        self.assertIsNone(try_simple_decimal("0b100\n", 0))
+
+    def test_lazy_import(self):
+        # Test that try_simple_decimal() can parse the TOML file without
+        # importing regular expressions (tomllib._re)
+        filename = os_helper.TESTFN
+        self.addCleanup(os_helper.unlink, filename)
+        toml = textwrap.dedent("""
+            [metadata]
+            int = 123
+            list = [+1, -2, 3]
+            table = {x=1, y=2}
+        """)
+        with open(filename, "w", encoding="utf-8") as fp:
+            fp.write(toml)
+
+        code = textwrap.dedent(f"""
+            import sys, tomllib
+            with open({filename!a}, "rb") as fp:
+                tomllib.load(fp)
+            print("lazy import?", 'tomllib._re' not in sys.modules)
+        """)
+        proc = assert_python_ok('-c', code)
+        self.assertIn(b'lazy import? True', proc.out)
diff --git a/Lib/tomllib/_parser.py b/Lib/tomllib/_parser.py
index b59d0f7d54bdc3..ede6859a64396b 100644
--- a/Lib/tomllib/_parser.py
+++ b/Lib/tomllib/_parser.py
@@ -4,7 +4,10 @@
 
 from __future__ import annotations
 
-from types import MappingProxyType
+# Defer loading regular expressions until we actually need them in
+# parse_value(). Before that, use try_simple_decimal() to parse simple
+# decimal numbers.
+__lazy_modules__ = ["tomllib._re"]
 
 from ._re import (
     RE_DATETIME,
@@ -42,7 +45,18 @@
 KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'")
 HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789")
 
-BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType(
+# If one of these follows a "simple decimal" it could mean that
+# the value is actually something else (float, datetime...), so
+# optimized parsing should be abandoned.
+ILLEGAL_AFTER_SIMPLE_DECIMAL: Final = frozenset(
+    "eE."  # float (exponent, decimal point)
+    "xbo"  # hex, bin, oct
+    "-"  # datetime
+    ":"  # localtime
+    "_0123456789"  # complex decimal
+)
+
+BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict(  # type: ignore[name-defined]
     {
         "\\b": "\u0008",  # backspace
         "\\t": "\u0009",  # tab
@@ -665,6 +679,37 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]:
         pos += 1
 
 
+def try_simple_decimal(
+    src: str, pos: Pos
+) -> None | tuple[Pos, int]:
+    """Parse a "simple" decimal integer.
+
+    An optimization that tries to parse a simple decimal integer
+    without underscores. Returns `(end_pos, value)` on success, or
+    `None` if there's any uncertainty on correctness.
+    """
+    start_pos = pos
+
+    if src.startswith(("+", "-"), pos):
+        pos += 1
+
+    if src.startswith("0", pos):
+        pos += 1
+    elif src.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9"), pos):
+        pos = skip_chars(src, pos, "0123456789")
+    else:
+        return None
+
+    try:
+        next_char = src[pos]
+    except IndexError:
+        next_char = None
+    if next_char in ILLEGAL_AFTER_SIMPLE_DECIMAL:
+        return None
+
+    return pos, int(src[start_pos:pos])
+
+
 def parse_value(
     src: str, pos: Pos, parse_float: ParseFloat
 ) -> tuple[Pos, Any]:
@@ -703,6 +748,13 @@ def parse_value(
     if char == "{":
         return parse_inline_table(src, pos, parse_float)
 
+    # Try a simple parser for decimal numbers. If it's able to parse all
+    # numbers, it avoids importing tomllib._re which has an impact on
+    # the tomllib startup time.
+    number = try_simple_decimal(src, pos)
+    if number is not None:
+        return number
+
     # Dates and times
     datetime_match = RE_DATETIME.match(src, pos)
     if datetime_match:
diff --git a/Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst b/Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst
new file mode 100644
index 00000000000000..581c52926c3565
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-04-02-05-06-34.gh-issue-147991.2ANtR5.rst
@@ -0,0 +1,2 @@
+Improve :mod:`tomllib` import time (up to 10x faster). Patch by Victor
+Stinner.