Skip to content

Commit f074da7

Browse files
committed
Resolve .gitignore merge conflict
Keep both reference repositories section and standard Python gitignore structure. Co-authored-by: Justar96
1 parent 1cc45a9 commit f074da7

16 files changed

Lines changed: 1555 additions & 200 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
# Reference repositories
2+
!ptoon-reference/
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[cod]

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,9 @@ authors = [
77
{ name = "Johann Schopplich", email = "hello@johannschopplich.com" }
88
]
99
requires-python = ">=3.8"
10-
dependencies = []
10+
dependencies = [
11+
"typing-extensions>=4.0.0; python_version < '3.10'",
12+
]
1113
license = { text = "MIT" }
1214
keywords = ["toon", "serialization", "llm", "data-format", "token-efficient"]
1315
classifiers = [
@@ -40,6 +42,7 @@ dev = [
4042
"pytest-cov>=4.1.0",
4143
"ruff>=0.8.0",
4244
"mypy>=1.8.0",
45+
"black>=24.8.0",
4346
]
4447

4548
[tool.pytest.ini_options]

src/toon_format/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from .decoder import ToonDecodeError, decode
99
from .encoder import encode
1010
from .types import DecodeOptions, Delimiter, DelimiterKey, EncodeOptions
11+
from .utils import compare_formats, count_tokens, estimate_savings
1112

1213
__version__ = "0.1.1"
1314
__all__ = [
@@ -18,4 +19,7 @@
1819
"DelimiterKey",
1920
"EncodeOptions",
2021
"DecodeOptions",
22+
"count_tokens",
23+
"estimate_savings",
24+
"compare_formats",
2125
]

src/toon_format/_literal_utils.py

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,67 @@
1+
"""Utilities for detecting literal token types.
2+
3+
This module provides functions to identify different types of literal
4+
values in TOON syntax, such as booleans, null, and numeric literals.
5+
"""
6+
7+
from .constants import FALSE_LITERAL, NULL_LITERAL, TRUE_LITERAL
8+
9+
10+
def is_boolean_or_null_literal(token: str) -> bool:
    """Report whether a token is one of the keyword literals.

    The keyword literals are `true`, `false` and `null`, taken from
    ``TRUE_LITERAL``, ``FALSE_LITERAL`` and ``NULL_LITERAL``. The match is an
    exact string comparison; no case folding or trimming is applied.

    Args:
        token: The token to check

    Returns:
        True if the token equals one of the three keyword literals

    Examples:
        >>> is_boolean_or_null_literal("true")
        True
        >>> is_boolean_or_null_literal("null")
        True
        >>> is_boolean_or_null_literal("hello")
        False
    """
    keyword_literals = (TRUE_LITERAL, FALSE_LITERAL, NULL_LITERAL)
    return token in keyword_literals
28+
29+
30+
def is_numeric_literal(token: str) -> bool:
    """Check if a token represents a valid numeric literal.

    Rejects numbers with leading zeros (except `"0"` itself or decimals like
    `"0.5"`), including when the number carries a sign (`"-0123"`).
    Per Section 7.3 of the TOON specification.

    Parsing is delegated to ``float()``, which is more permissive than a
    numeric token should be, so the inputs it would silently normalise are
    rejected up front: digit-group underscores (`"1_000"`), surrounding
    whitespace (`" 42"`) and non-ASCII digits.

    Args:
        token: The token to check

    Returns:
        True if the token is a valid, finite numeric literal

    Examples:
        >>> is_numeric_literal("42")
        True
        >>> is_numeric_literal("3.14")
        True
        >>> is_numeric_literal("0.5")
        True
        >>> is_numeric_literal("0123")  # Leading zero - not valid
        False
        >>> is_numeric_literal("-0123")  # Leading zero behind a sign
        False
        >>> is_numeric_literal("1_000")  # Underscores are Python-only syntax
        False
        >>> is_numeric_literal("hello")
        False
    """
    if not token:
        return False

    # float() accepts underscores, padding whitespace and Unicode digits;
    # none of those should turn a string token into a number.
    if "_" in token or token != token.strip() or not token.isascii():
        return False

    # Look past an optional sign so the leading-zero rule also covers "-0123".
    digits = token[1:] if token[0] in "+-" else token

    # Must not have leading zeros (except for `"0"` itself or decimals like `"0.5"`)
    if len(digits) > 1 and digits[0] == "0" and digits[1] != ".":
        return False

    try:
        num = float(token)
    except ValueError:
        return False

    # NaN fails every comparison and the infinities fail the strict bounds,
    # so this single chained comparison accepts finite values only.
    return float("-inf") < num < float("inf")

src/toon_format/_scanner.py

Lines changed: 260 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,260 @@
1+
"""Scanner for parsing TOON input into lines with depth information.
2+
3+
This module implements the first stage of the TOON decoding pipeline:
4+
scanning the input text and converting it into structured line objects
5+
with depth and indentation metadata.
6+
"""
7+
8+
from dataclasses import dataclass
9+
from typing import List, Optional, Tuple
10+
11+
from .constants import SPACE, TAB
12+
13+
14+
@dataclass
class ParsedLine:
    """A parsed line with metadata.

    Instances are built by ``to_parsed_lines`` for every line that has
    non-whitespace content; blank lines are reported separately as
    ``BlankLineInfo``.

    Attributes:
        raw: The original raw line content
        depth: The indentation depth (number of indent levels)
        indent: The number of leading spaces
        content: The line content after removing indentation
        line_num: The 1-based line number in the source
    """

    # Field order is part of the positional constructor signature.
    raw: str
    # Number of whole indent levels, i.e. indent // indent_size.
    depth: int
    # Counts leading space characters only; tabs are not included.
    indent: int
    # raw with the leading spaces sliced off (raw[indent:]).
    content: str
    line_num: int
31+
32+
33+
@dataclass
class BlankLineInfo:
    """Information about a blank line.

    ``to_parsed_lines`` records one of these for each line whose content is
    empty or whitespace-only, instead of emitting a ``ParsedLine`` for it.

    Attributes:
        line_num: The 1-based line number
        indent: The number of leading spaces
        depth: The computed indentation depth
    """

    # Field order is part of the positional constructor signature.
    line_num: int
    # Counts leading space characters only.
    indent: int
    # Number of whole indent levels, i.e. indent // indent_size.
    depth: int
46+
47+
48+
class LineCursor:
    """Iterator-like class for traversing parsed lines.

    Provides methods to peek at the current line, advance to the next line,
    and check for lines at specific depths. This abstraction makes the decoder
    logic cleaner and easier to test.

    The cursor position always stays within ``0..len(lines)``; advancing at
    the end is a no-op, so ``current()`` remains safe to call afterwards.
    """

    def __init__(
        self,
        lines: List["ParsedLine"],
        blank_lines: Optional[List["BlankLineInfo"]] = None,
    ) -> None:
        """Initialize a line cursor.

        Args:
            lines: The parsed lines to traverse
            blank_lines: Optional list of blank line information
        """
        self._lines = lines
        self._index = 0
        self._blank_lines = blank_lines or []

    def get_blank_lines(self) -> List["BlankLineInfo"]:
        """Get the list of blank lines."""
        return self._blank_lines

    def peek(self) -> Optional["ParsedLine"]:
        """Peek at the current line without advancing.

        Returns:
            The current line, or None if at end
        """
        if self.at_end():
            return None
        return self._lines[self._index]

    def next(self) -> Optional["ParsedLine"]:
        """Get the current line and advance.

        Returns:
            The current line, or None if at end
        """
        line = self.peek()
        if line is not None:
            self._index += 1
        return line

    def current(self) -> Optional["ParsedLine"]:
        """Get the most recently consumed line.

        Returns:
            The previous line, or None if no line has been consumed
        """
        if self._index > 0:
            return self._lines[self._index - 1]
        return None

    def advance(self) -> None:
        """Advance to the next line.

        Does nothing when the cursor is already at the end. Without this
        bound the index could run past ``len(lines)`` and a later
        ``current()`` call would raise ``IndexError``.
        """
        if self._index < len(self._lines):
            self._index += 1

    def at_end(self) -> bool:
        """Check if cursor is at the end of lines.

        Returns:
            True if at end
        """
        return self._index >= len(self._lines)

    @property
    def length(self) -> int:
        """Get the total number of lines."""
        return len(self._lines)

    def peek_at_depth(self, target_depth: int) -> Optional["ParsedLine"]:
        """Peek at the next line at a specific depth.

        Only the line at the cursor is inspected; the cursor does not move
        and no look-ahead past that line is performed.

        Args:
            target_depth: The target depth

        Returns:
            The line if it matches the depth, None otherwise
        """
        line = self.peek()
        if line is not None and line.depth == target_depth:
            return line
        return None

    def has_more_at_depth(self, target_depth: int) -> bool:
        """Check if there are more lines at a specific depth.

        Args:
            target_depth: The target depth

        Returns:
            True if the line at the cursor is at the target depth
        """
        return self.peek_at_depth(target_depth) is not None
150+
151+
152+
def to_parsed_lines(
    source: str,
    indent_size: int,
    strict: bool,
) -> Tuple[List[ParsedLine], List[BlankLineInfo]]:
    """Scan source text into content lines and blank-line records.

    Per Section 12 of the TOON specification for indentation handling.
    This is the entry point for the scanning stage of the decoder pipeline.

    The source is split on ``"\\n"``. For each line the leading spaces are
    counted and converted to a depth. Lines with no non-whitespace content
    are recorded as ``BlankLineInfo`` and skip strict validation; every other
    line becomes a ``ParsedLine``.

    Args:
        source: The source string to parse
        indent_size: The number of spaces per indentation level
        strict: Whether to enforce strict indentation validation

    Returns:
        A tuple of (parsed_lines, blank_lines); both lists are empty when
        the source is empty or whitespace-only

    Raises:
        SyntaxError: If strict mode validation fails (tabs in indentation, invalid spacing)

    Examples:
        >>> lines, blanks = to_parsed_lines("name: Alice\\n  age: 30", 2, True)
        >>> lines[0].content
        'name: Alice'
        >>> lines[1].depth
        1
    """
    if not source.strip():
        return [], []

    content_lines: List[ParsedLine] = []
    blanks: List[BlankLineInfo] = []

    for line_num, raw in enumerate(source.split("\n"), start=1):
        # Only spaces count towards the indent; a tab ends the count.
        indent = 0
        for ch in raw:
            if ch != SPACE:
                break
            indent += 1

        content = raw[indent:]
        depth = _compute_depth_from_indent(indent, indent_size)

        # Blank lines are tracked separately and never validated.
        if not content.strip():
            blanks.append(BlankLineInfo(line_num=line_num, indent=indent, depth=depth))
            continue

        if strict:
            # Walk the leading run of spaces/tabs; any tab in it is an error.
            for ch in raw:
                if ch == TAB:
                    raise SyntaxError(
                        f"Line {line_num}: Tabs not allowed in indentation in strict mode"
                    )
                if ch != SPACE:
                    break

            # Indentation must land exactly on a level boundary.
            if indent > 0 and indent % indent_size != 0:
                raise SyntaxError(
                    f"Line {line_num}: Indent must be exact multiple of {indent_size}, "
                    f"but found {indent} spaces"
                )

        content_lines.append(
            ParsedLine(
                raw=raw,
                depth=depth,
                indent=indent,
                content=content,
                line_num=line_num,
            )
        )

    return content_lines, blanks
240+
241+
242+
def _compute_depth_from_indent(indent_spaces: int, indent_size: int) -> int:
243+
"""Compute depth from indentation spaces.
244+
245+
Args:
246+
indent_spaces: Number of leading spaces
247+
indent_size: Number of spaces per indentation level
248+
249+
Returns:
250+
The computed depth
251+
252+
Examples:
253+
>>> _compute_depth_from_indent(0, 2)
254+
0
255+
>>> _compute_depth_from_indent(4, 2)
256+
2
257+
>>> _compute_depth_from_indent(3, 2) # Lenient mode
258+
1
259+
"""
260+
return indent_spaces // indent_size

0 commit comments

Comments
 (0)