|
4 | 4 |
|
5 | 5 | from __future__ import annotations |
6 | 6 |
|
| 7 | +# Defer loading regular expressions until we actually need them in |
| 8 | +# parse_value(). Before that, use try_simple_decimal() to parse simple |
| 9 | +# decimal integers. |
7 | 10 | __lazy_modules__ = ["tomllib._re"] |
8 | 11 |
|
9 | 12 | from ._re import ( |
|
41 | 44 | ) |
42 | 45 | KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'") |
43 | 46 | HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789") |
44 | | -DECDIGIT_CHARS: Final = frozenset("0123456789") |
45 | | -NUMBER_INITIAL_CHARS: Final = DECDIGIT_CHARS | frozenset("+-") |
46 | | -NUMBER_END_CHARS: Final = frozenset(",]}") | TOML_WS_AND_NEWLINE |
| 47 | + |
| 48 | +# If one of these follows a "simple decimal" it could mean that |
| 49 | +# the value is actually something else (float, datetime...), so |
| 50 | +# optimized parsing should be abandoned. |
| 51 | +ILLEGAL_AFTER_SIMPLE_DECIMAL: Final = frozenset( |
| 52 | + "eE." # float (exponent or fraction) |
| 53 | + "xbo" # hex, bin, oct |
| 54 | + "-" # datetime |
| 55 | + ":" # localtime |
| 56 | + "_0123456789" # complex decimal |
| 57 | +) |
47 | 58 |
|
48 | 59 | BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict( # type: ignore[name-defined] |
49 | 60 | { |
@@ -668,34 +679,35 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]: |
668 | 679 | pos += 1 |
669 | 680 |
|
670 | 681 |
|
671 | | -# Sub-set of RE_NUMBER: only support decimal integer without "_" separator |
672 | 682 | def try_simple_decimal( |
673 | 683 | src: str, pos: Pos |
674 | 684 | ) -> None | tuple[Pos, int]: |
675 | | - start = pos |
676 | | - end = len(src) |
677 | | - end_chars = NUMBER_END_CHARS |
678 | | - if src[pos] in '+-': |
679 | | - pos += 1 |
680 | | - if pos >= end: |
681 | | - return None |
682 | | - if src[pos] not in DECDIGIT_CHARS: |
683 | | - return None |
| 685 | + """Parse a "simple" decimal integer. |
| 686 | +
|
| 687 | + An optimization that tries to parse a simple decimal integer |
| 688 | + without underscores. Returns `None` whenever the value might be |
| 689 | + something other than a simple decimal integer. |
| 690 | + """ |
| 691 | + start_pos = pos |
684 | 692 |
|
685 | | - if src[pos] == '0': |
| 693 | + if src.startswith(("+", "-"), pos): |
686 | 694 | pos += 1 |
687 | | - if pos < end and src[pos] not in end_chars: |
688 | | - return None |
689 | | - return pos, 0 |
690 | 695 |
|
691 | | - while src[pos] in DECDIGIT_CHARS: |
| 696 | + if src.startswith("0", pos): |
692 | 697 | pos += 1 |
693 | | - if pos >= end: |
694 | | - break |
| 698 | + elif src.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9"), pos): |
| 699 | + pos = skip_chars(src, pos, "0123456789") |
695 | 700 | else: |
696 | | - if src[pos] not in end_chars: |
697 | | - return None |
698 | | - return pos, int(src[start:pos]) |
| 701 | + return None |
| 702 | + |
| 703 | + try: |
| 704 | + next_char = src[pos] |
| 705 | + except IndexError: |
| 706 | + next_char = None |
| 707 | + if next_char in ILLEGAL_AFTER_SIMPLE_DECIMAL: |
| 708 | + return None |
| 709 | + |
| 710 | + return pos, int(src[start_pos:pos]) |
699 | 711 |
|
700 | 712 |
|
701 | 713 | def parse_value( |
@@ -736,12 +748,12 @@ def parse_value( |
736 | 748 | if char == "{": |
737 | 749 | return parse_inline_table(src, pos, parse_float) |
738 | 750 |
|
739 | | - # First try a simple number parser which defers import tomllib._re |
740 | | - # to speed up tomllib import time |
741 | | - if char in NUMBER_INITIAL_CHARS: |
742 | | - res = try_simple_decimal(src, pos) |
743 | | - if res is not None: |
744 | | - return res |
| 751 | + # Try a simple parser for decimal integers first. If it handles every |
| 752 | + # number in the document, importing tomllib._re is avoided, which |
| 753 | + # reduces the tomllib startup time. |
| 754 | + res = try_simple_decimal(src, pos) |
| 755 | + if res is not None: |
| 756 | + return res |
745 | 757 |
|
746 | 758 | # Dates and times |
747 | 759 | datetime_match = RE_DATETIME.match(src, pos) |
|
0 commit comments