toon-format
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 4 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎src/toon_format/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎src/toon_format/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/toon_format/_literal_utils.py‎
Lines changed: 67 additions & 0 deletions b/‎src/toon_format/_literal_utils.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎src/toon_format/_scanner.py‎
Lines changed: 260 additions & 0 deletions b/‎src/toon_format/_scanner.py‎
Lines changed: 260 additions & 0 deletions
@@ -1,3 +1,6 @@
+# Reference repositories
+!ptoon-reference/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 
@@ -7,7 +7,9 @@ authors = [
     { name = "Johann Schopplich", email = "hello@johannschopplich.com" }
 ]
 requires-python = ">=3.8"
-dependencies = []
+dependencies = [
+    "typing-extensions>=4.0.0; python_version < '3.10'",
+]
 license = { text = "MIT" }
 keywords = ["toon", "serialization", "llm", "data-format", "token-efficient"]
 classifiers = [
@@ -40,6 +42,7 @@ dev = [
     "pytest-cov>=4.1.0",
     "ruff>=0.8.0",
     "mypy>=1.8.0",
+    "black>=24.8.0",
 ]
 
 [tool.pytest.ini_options]
 
@@ -8,6 +8,7 @@
 from .decoder import ToonDecodeError, decode
 from .encoder import encode
 from .types import DecodeOptions, Delimiter, DelimiterKey, EncodeOptions
+from .utils import compare_formats, count_tokens, estimate_savings
 
 __version__ = "0.1.1"
 __all__ = [
@@ -18,4 +19,7 @@
     "DelimiterKey",
     "EncodeOptions",
     "DecodeOptions",
+    "count_tokens",
+    "estimate_savings",
+    "compare_formats",
 ]
@@ -0,0 +1,67 @@
+"""Utilities for detecting literal token types.
+
+This module provides functions to identify different types of literal
+values in TOON syntax, such as booleans, null, and numeric literals.
+"""
+
+from .constants import FALSE_LITERAL, NULL_LITERAL, TRUE_LITERAL
+
+
+def is_boolean_or_null_literal(token: str) -> bool:
+    """Tell whether a token is one of the keyword literals `true`, `false`, `null`.
+
+    The comparison is exact: the token must equal one of the three
+    literal constants, so differently-cased or padded text does not match.
+
+    Args:
+        token: The token to check
+
+    Returns:
+        True if the token equals the true, false, or null literal
+
+    Examples:
+        >>> is_boolean_or_null_literal("true")
+        True
+        >>> is_boolean_or_null_literal("null")
+        True
+        >>> is_boolean_or_null_literal("hello")
+        False
+    """
+    keyword_literals = (TRUE_LITERAL, FALSE_LITERAL, NULL_LITERAL)
+    return token in keyword_literals
+
+
+def is_numeric_literal(token: str) -> bool:
+    """Check if a token represents a valid numeric literal.
+
+    Rejects numbers with leading zeros (except `"0"` itself or decimals like `"0.5"`).
+    Per Section 7.3 of the TOON specification.
+
+    Parsing is delegated to `float()`, which also understands spellings that
+    are specific to Python: digit-group underscores (`"1_000"`) and non-ASCII
+    decimal digits (e.g. full-width `"１２３"`). Those are filtered out before
+    conversion so they are not mistaken for numbers. NaN and infinities
+    (including overflowing exponents such as `"1e999"`) are rejected as well.
+
+    Args:
+        token: The token to check
+
+    Returns:
+        True if the token is a valid numeric literal
+
+    Examples:
+        >>> is_numeric_literal("42")
+        True
+        >>> is_numeric_literal("3.14")
+        True
+        >>> is_numeric_literal("0.5")
+        True
+        >>> is_numeric_literal("0123")  # Leading zero - not valid
+        False
+        >>> is_numeric_literal("1_000")  # Python-only digit grouping
+        False
+        >>> is_numeric_literal("hello")
+        False
+    """
+    if not token:
+        return False
+
+    # float() would happily parse these, but they are Python-specific forms
+    if "_" in token or not token.isascii():
+        return False
+
+    # Must not have leading zeros (except for `"0"` itself or decimals like `"0.5"`)
+    if len(token) > 1 and token[0] == "0" and token[1] != ".":
+        return False
+
+    # Keep the try body down to the one call that can raise
+    try:
+        num = float(token)
+    except ValueError:
+        return False
+
+    # NaN fails every comparison, so one chained test rejects NaN and +/-inf
+    return float("-inf") < num < float("inf")
@@ -0,0 +1,260 @@
+"""Scanner for parsing TOON input into lines with depth information.
+
+This module implements the first stage of the TOON decoding pipeline:
+scanning the input text and converting it into structured line objects
+with depth and indentation metadata.
+"""
+
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+from .constants import SPACE, TAB
+
+
+@dataclass
+class ParsedLine:
+    """A non-blank source line together with its indentation metadata.
+
+    Instances are produced by `to_parsed_lines`; blank lines are reported
+    separately as `BlankLineInfo` and never become a `ParsedLine`.
+
+    Attributes:
+        raw: The original raw line content
+        depth: The indentation depth (`indent` floor-divided by the indent size)
+        indent: The number of leading space characters (tabs are not counted)
+        content: The line content after removing the leading spaces
+        line_num: The 1-based line number in the source
+    """
+
+    # Line text exactly as obtained by splitting the source on "\n"
+    raw: str
+    # Nesting level derived from `indent`
+    depth: int
+    # Count of leading space characters in `raw`
+    indent: int
+    # `raw` with the leading spaces sliced off
+    content: str
+    # 1-based position of the line in the source
+    line_num: int
+
+
+@dataclass
+class BlankLineInfo:
+    """Position and indentation of a blank line found while scanning.
+
+    `to_parsed_lines` records one of these for every line that holds
+    nothing but whitespace after its leading spaces.
+
+    Attributes:
+        line_num: The 1-based line number
+        indent: The number of leading spaces
+        depth: The computed indentation depth
+    """
+
+    # 1-based position of the blank line in the source
+    line_num: int
+    # Count of leading space characters on the line
+    indent: int
+    # `indent` floor-divided by the indent size
+    depth: int
+
+
+class LineCursor:
+    """Iterator-like class for traversing parsed lines.
+
+    Provides methods to peek at the current line, advance to the next line,
+    and check for lines at specific depths. This abstraction makes the decoder
+    logic cleaner and easier to test.
+
+    The cursor keeps a single index into the line list. `advance()` does not
+    clamp that index, so every accessor is written to tolerate an index that
+    has moved past the end and answers `None` instead of raising.
+    """
+
+    def __init__(
+        self,
+        lines: List[ParsedLine],
+        blank_lines: Optional[List[BlankLineInfo]] = None,
+    ) -> None:
+        """Initialize a line cursor.
+
+        Args:
+            lines: The parsed lines to traverse
+            blank_lines: Optional list of blank line information
+        """
+        self._lines = lines
+        self._index = 0
+        self._blank_lines = blank_lines or []
+
+    def get_blank_lines(self) -> List[BlankLineInfo]:
+        """Get the list of blank lines."""
+        return self._blank_lines
+
+    def peek(self) -> Optional[ParsedLine]:
+        """Peek at the current line without advancing.
+
+        Returns:
+            The current line, or None if at end
+        """
+        if self.at_end():
+            return None
+        return self._lines[self._index]
+
+    def next(self) -> Optional[ParsedLine]:
+        """Get the current line and advance.
+
+        Returns:
+            The current line, or None if at end (the cursor is then left
+            where it is)
+        """
+        line = self.peek()
+        if line is not None:
+            self._index += 1
+        return line
+
+    def current(self) -> Optional[ParsedLine]:
+        """Get the most recently consumed line.
+
+        Returns:
+            The previous line, or None if no line has been consumed or the
+            cursor has been advanced beyond the last line
+        """
+        previous = self._index - 1
+        # The upper bound matters: advance() may have pushed the index past
+        # the end, and indexing there would raise IndexError.
+        if 0 <= previous < len(self._lines):
+            return self._lines[previous]
+        return None
+
+    def advance(self) -> None:
+        """Advance to the next line.
+
+        The index is incremented unconditionally, even when already at end.
+        """
+        self._index += 1
+
+    def at_end(self) -> bool:
+        """Check if cursor is at the end of lines.
+
+        Returns:
+            True if at end
+        """
+        return self._index >= len(self._lines)
+
+    @property
+    def length(self) -> int:
+        """Get the total number of lines."""
+        return len(self._lines)
+
+    def peek_at_depth(self, target_depth: int) -> Optional[ParsedLine]:
+        """Peek at the next line at a specific depth.
+
+        Args:
+            target_depth: The target depth
+
+        Returns:
+            The line if its depth equals the target depth; None if the
+            cursor is at end or the line is shallower or deeper
+        """
+        line = self.peek()
+        if line is not None and line.depth == target_depth:
+            return line
+        return None
+
+    def has_more_at_depth(self, target_depth: int) -> bool:
+        """Check if there are more lines at a specific depth.
+
+        Args:
+            target_depth: The target depth
+
+        Returns:
+            True if the line under the cursor is at the target depth
+        """
+        return self.peek_at_depth(target_depth) is not None
+
+
+def to_parsed_lines(
+    source: str,
+    indent_size: int,
+    strict: bool,
+) -> Tuple[List[ParsedLine], List[BlankLineInfo]]:
+    """Scan TOON source text into indentation-aware line records.
+
+    Per Section 12 of the TOON specification for indentation handling.
+    This is the entry point for the scanning stage of the decoder pipeline.
+
+    The source is split on "\\n". For each line the run of leading spaces is
+    counted and converted to a depth. Lines that contain only whitespace are
+    recorded as blank lines and skipped; all other lines become parsed lines.
+    Strict-mode checks apply to non-blank lines only.
+
+    Args:
+        source: The source string to parse
+        indent_size: The number of spaces per indentation level
+        strict: Whether to enforce strict indentation validation
+
+    Returns:
+        A tuple of (parsed_lines, blank_lines); both are empty when the
+        source is empty or whitespace-only
+
+    Raises:
+        SyntaxError: If strict mode validation fails (tabs in indentation, invalid spacing)
+
+    Examples:
+        >>> lines, blanks = to_parsed_lines("name: Alice\\n  age: 30", 2, True)
+        >>> lines[0].content
+        'name: Alice'
+        >>> lines[1].depth
+        1
+    """
+    parsed: List[ParsedLine] = []
+    blank_lines: List[BlankLineInfo] = []
+
+    if not source.strip():
+        return parsed, blank_lines
+
+    for line_num, raw in enumerate(source.split("\n"), start=1):
+        # Length of the leading run of spaces; the whole line if it is all spaces
+        indent = len(raw)
+        for position, char in enumerate(raw):
+            if char != SPACE:
+                indent = position
+                break
+
+        content = raw[indent:]
+        depth = _compute_depth_from_indent(indent, indent_size)
+
+        # Whitespace-only lines are tracked separately and never validated
+        if not content.strip():
+            blank_lines.append(BlankLineInfo(line_num=line_num, indent=indent, depth=depth))
+            continue
+
+        if strict:
+            # `content` starts at the first non-space character, so the leading
+            # whitespace region contains a tab exactly when that character is one.
+            if content[0] == TAB:
+                raise SyntaxError(
+                    f"Line {line_num}: Tabs not allowed in indentation in strict mode"
+                )
+
+            # Indentation has to land exactly on a level boundary
+            if indent > 0 and indent % indent_size != 0:
+                raise SyntaxError(
+                    f"Line {line_num}: Indent must be exact multiple of {indent_size}, "
+                    f"but found {indent} spaces"
+                )
+
+        parsed.append(
+            ParsedLine(
+                raw=raw,
+                depth=depth,
+                indent=indent,
+                content=content,
+                line_num=line_num,
+            )
+        )
+
+    return parsed, blank_lines
+
+
+def _compute_depth_from_indent(indent_spaces: int, indent_size: int) -> int:
+    """Convert a count of leading spaces into an indentation depth.
+
+    The depth is the number of whole indent levels that fit into the
+    leading spaces; any remainder is discarded.
+
+    Args:
+        indent_spaces: Number of leading spaces
+        indent_size: Number of spaces per indentation level
+
+    Returns:
+        The computed depth
+
+    Examples:
+        >>> _compute_depth_from_indent(0, 2)
+        0
+        >>> _compute_depth_from_indent(4, 2)
+        2
+        >>> _compute_depth_from_indent(3, 2)  # Lenient mode
+        1
+    """
+    whole_levels, _leftover = divmod(indent_spaces, indent_size)
+    return whole_levels