Skip to content

Commit a207fbb

Browse files
vstinner and hukkin committed
Use Taneli's implementation of try_simple_decimal()
Co-authored-by: Taneli Hukkinen <hukkinen@eurecom.fr>
1 parent 9be2a80 commit a207fbb

File tree

1 file changed

+41
-29
lines changed

1 file changed

+41
-29
lines changed

Lib/tomllib/_parser.py

Lines changed: 41 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,9 @@
44

55
from __future__ import annotations
66

7+
# Defer loading regular expressions until we actually need them in
8+
# parse_value(). Before that, use try_simple_decimal() to parse simple
9+
# decimal numbers.
710
__lazy_modules__ = ["tomllib._re"]
811

912
from ._re import (
@@ -41,9 +44,17 @@
4144
)
4245
KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'")
4346
HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789")
44-
DECDIGIT_CHARS: Final = frozenset("0123456789")
45-
NUMBER_INITIAL_CHARS: Final = DECDIGIT_CHARS | frozenset("+-")
46-
NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE
47+
48+
# If one of these follows a "simple decimal" it could mean that
49+
# the value is actually something else (float, datetime...), so
50+
# optimized parsing should be abandoned.
51+
ILLEGAL_AFTER_SIMPLE_DECIMAL: Final = frozenset(
52+
"eE." # decimal
53+
"xbo" # hex, bin, oct
54+
"-" # datetime
55+
":" # localtime
56+
"_0123456789" # complex decimal
57+
)
4758

4859
BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict( # type: ignore[name-defined]
4960
{
@@ -668,34 +679,35 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]:
668679
pos += 1
669680

670681

671-
# Sub-set of RE_NUMBER: only support decimal integer without "_" separator
672682
def try_simple_decimal(
673683
src: str, pos: Pos
674684
) -> None | tuple[Pos, int]:
675-
start = pos
676-
end = len(src)
677-
end_chars = NUMBER_END_CHARS
678-
if src[pos] in '+-':
679-
pos += 1
680-
if pos >= end:
681-
return None
682-
if src[pos] not in DECDIGIT_CHARS:
683-
return None
685+
"""Parse a "simple" decimal integer.
686+
687+
An optimization that tries to parse a simple decimal integer
688+
without underscores. Returns `None` if there's any uncertainty
689+
on correctness.
690+
"""
691+
start_pos = pos
684692

685-
if src[pos] == '0':
693+
if src.startswith(("+", "-"), pos):
686694
pos += 1
687-
if pos < end and src[pos] not in end_chars:
688-
return None
689-
return pos, 0
690695

691-
while src[pos] in DECDIGIT_CHARS:
696+
if src.startswith("0", pos):
692697
pos += 1
693-
if pos >= end:
694-
break
698+
elif src.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9"), pos):
699+
pos = skip_chars(src, pos, "0123456789")
695700
else:
696-
if src[pos] not in end_chars:
697-
return None
698-
return pos, int(src[start:pos])
701+
return None
702+
703+
try:
704+
next_char = src[pos]
705+
except IndexError:
706+
next_char = None
707+
if next_char in ILLEGAL_AFTER_SIMPLE_DECIMAL:
708+
return None
709+
710+
return pos, int(src[start_pos:pos])
699711

700712

701713
def parse_value(
@@ -736,12 +748,12 @@ def parse_value(
736748
if char == "{":
737749
return parse_inline_table(src, pos, parse_float)
738750

739-
# First try a simple number parser which defers import tomllib._re
740-
# to speed up tomllib import time
741-
if char in NUMBER_INITIAL_CHARS:
742-
res = try_simple_decimal(src, pos)
743-
if res is not None:
744-
return res
751+
# Try a simple parser for decimal numbers. If it's able to parse all
752+
# numbers, it avoids importing tomllib._re which has an impact on
753+
# the tomllib startup time.
754+
res = try_simple_decimal(src, pos)
755+
if res is not None:
756+
return res
745757

746758
# Dates and times
747759
datetime_match = RE_DATETIME.match(src, pos)

0 commit comments

Comments
 (0)