Skip to content

Commit b3451f3

Browse files
committed
implemented tokenizer for parser
1 parent b3b76af commit b3451f3

9 files changed

Lines changed: 248 additions & 0 deletions

File tree

File renamed without changes.
File renamed without changes.
File renamed without changes.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Public API of the tokenizer package.

Re-exports the tokenizer, token classes, character stream, and the
token-matcher strategy classes so callers can import everything from
the package root.
"""

from .tokenizer import Tokenizer
from .token import Token, TokenType
from .character_stream import CharacterStream
from .token_matcher import TokenMatcher, SingleCharTokenMatcher, AtomTokenMatcher, EOFTokenMatcher

# Names exported by `from <package> import *`.
__all__ = [
    "Tokenizer",
    "Token",
    "TokenType",
    "CharacterStream",
    "TokenMatcher",
    "SingleCharTokenMatcher",
    "AtomTokenMatcher",
    "EOFTokenMatcher",
]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from typing import Optional
2+
3+
4+
class CharacterStream:
    """Forward-only cursor over an immutable string of source text."""

    def __init__(self, text: str):
        self._text = text
        self._position = 0

    def _char_at(self, index: int) -> Optional[str]:
        # Shared bounds-checked lookup used by current_char and peek.
        return self._text[index] if index < len(self._text) else None

    @property
    def current_char(self) -> Optional[str]:
        """Character under the cursor, or None once the end is reached."""
        return self._char_at(self._position)

    @property
    def position(self) -> int:
        """Zero-based index of the cursor within the text."""
        return self._position

    def advance(self):
        """Move the cursor one character forward, clamped at end of text."""
        self._position = min(self._position + 1, len(self._text))

    def peek(self, offset: int = 1) -> Optional[str]:
        """Look *offset* characters ahead without moving the cursor."""
        return self._char_at(self._position + offset)

    def is_eof(self) -> bool:
        """True when the whole text has been consumed."""
        return self.current_char is None
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from enum import Enum
2+
3+
4+
class TokenType(Enum):
    """Kinds of lexical tokens in a propositional-logic formula."""

    ATOM = "ATOM"
    TRUTH = "TRUTH"
    FALSITY = "FALSITY"
    NEGATION = "NEGATION"
    CONJUNCTION = "CONJUNCTION"
    DISJUNCTION = "DISJUNCTION"
    IMPLICATION = "IMPLICATION"
    BICONDITIONAL = "BICONDITIONAL"
    LEFT_PAREN = "LEFT_PAREN"
    RIGHT_PAREN = "RIGHT_PAREN"
    EOF = "EOF"


class Token:
    """A lexical token: its type, source text, and start position.

    Equality and hashing deliberately ignore ``position`` so two tokens
    with the same type and text compare equal no matter where they
    occurred in the input.
    """

    def __init__(self, token_type: TokenType, value: str, position: int):
        self.type = token_type
        self.value = value
        self.position = position  # index in the source where the token starts

    def __repr__(self) -> str:
        return f"Token({self.type.name}, '{self.value}', {self.position})"

    def __eq__(self, other) -> bool:
        if not isinstance(other, Token):
            # Return NotImplemented (not False) so Python can try the
            # other operand's __eq__ before deciding the comparison.
            return NotImplemented
        return self.type == other.type and self.value == other.value

    def __hash__(self) -> int:
        # Fix: defining __eq__ alone sets __hash__ to None, which made
        # Token unhashable (unusable in sets / as dict keys).  Hash on
        # exactly the fields equality uses.
        return hash((self.type, self.value))
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Optional
3+
from .character_stream import CharacterStream
4+
from .token import Token, TokenType
5+
6+
7+
class TokenMatcher(ABC):
    """Strategy interface: recognise and consume one token from a stream."""

    @abstractmethod
    def matches(self, stream: CharacterStream) -> bool:
        """Return True if a token of this kind starts at the stream's current position."""

    @abstractmethod
    def create_token(self, stream: CharacterStream) -> Token:
        """Consume the matched characters and build the resulting Token."""
15+
16+
17+
class SingleCharTokenMatcher(TokenMatcher):
    """Matches one fixed character and maps it to a fixed token type."""

    def __init__(self, char: str, token_type: TokenType):
        self._char = char
        self._token_type = token_type

    def matches(self, stream: CharacterStream) -> bool:
        """True when the stream's current character is the configured one."""
        return stream.current_char == self._char

    def create_token(self, stream: CharacterStream) -> Token:
        """Consume the single character and emit its token."""
        start = stream.position
        stream.advance()
        return Token(self._token_type, self._char, start)
29+
30+
31+
class AtomTokenMatcher(TokenMatcher):
    """Matches atom names: runs of alphanumerics plus extra allowed characters.

    The default allowed character is the prime mark, so atoms such as
    p, p' and p'' are recognised.
    """

    def __init__(self, allowed_chars: str = "'"):
        self._allowed_chars = allowed_chars

    def _is_atom_char(self, char: Optional[str]) -> bool:
        # A character belongs to an atom if it is alphanumeric or explicitly allowed.
        return char is not None and (char.isalnum() or char in self._allowed_chars)

    def matches(self, stream: CharacterStream) -> bool:
        """True when an atom starts at the stream's current position."""
        return self._is_atom_char(stream.current_char)

    def create_token(self, stream: CharacterStream) -> Token:
        """Consume the full run of atom characters and emit an ATOM token."""
        start = stream.position
        chars = []
        while self._is_atom_char(stream.current_char):
            chars.append(stream.current_char)
            stream.advance()

        name = "".join(chars)
        if not name:
            # Defensive: only reachable if create_token is called without
            # a prior successful matches().
            raise ValueError(f"Empty atom at position {start}")

        return Token(TokenType.ATOM, name, start)
55+
56+
57+
class EOFTokenMatcher(TokenMatcher):
    """Matches end of input, producing a zero-width EOF token."""

    def matches(self, stream: CharacterStream) -> bool:
        """True once the stream is exhausted."""
        return stream.is_eof()

    def create_token(self, stream: CharacterStream) -> Token:
        # EOF carries no text; its position is the stream's end position.
        return Token(TokenType.EOF, "", stream.position)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from typing import List, Optional
2+
from .character_stream import CharacterStream
3+
from .token_matcher import TokenMatcher, SingleCharTokenMatcher, AtomTokenMatcher, EOFTokenMatcher
4+
from .token import Token, TokenType
5+
6+
7+
class Tokenizer:
    """Turns a propositional-logic formula string into a stream of Tokens.

    Matching is delegated to an ordered list of TokenMatcher strategies;
    the first matcher that accepts the current position produces the token.
    """

    def __init__(self, text: str, matchers: Optional[List[TokenMatcher]] = None):
        self._stream = CharacterStream(text)
        # Fall back to the standard connective/atom/EOF matchers when none given.
        self._matchers: List[TokenMatcher] = matchers or self._create_default_matchers()

    def _create_default_matchers(self) -> List[TokenMatcher]:
        """Build matchers for the standard connectives, atoms, and EOF."""
        connective_chars = [
            ("⊤", TokenType.TRUTH),
            ("⊥", TokenType.FALSITY),
            ("¬", TokenType.NEGATION),
            ("∧", TokenType.CONJUNCTION),
            ("∨", TokenType.DISJUNCTION),
            ("→", TokenType.IMPLICATION),
            ("↔", TokenType.BICONDITIONAL),
            ("(", TokenType.LEFT_PAREN),
            (")", TokenType.RIGHT_PAREN),
        ]
        defaults: List[TokenMatcher] = [
            SingleCharTokenMatcher(char, kind) for char, kind in connective_chars
        ]
        # Atom matching must come after the fixed symbols; EOF comes last.
        defaults.append(AtomTokenMatcher())
        defaults.append(EOFTokenMatcher())
        return defaults

    def _skip_whitespace(self):
        """Advance the stream past any whitespace before the next token."""
        stream = self._stream
        while stream.current_char is not None and stream.current_char.isspace():
            stream.advance()

    def next_token(self) -> Token:
        """Return the next token.

        Raises:
            ValueError: if no matcher recognises the current character.
        """
        self._skip_whitespace()

        winner = next((m for m in self._matchers if m.matches(self._stream)), None)
        if winner is not None:
            return winner.create_token(self._stream)

        char = self._stream.current_char
        position = self._stream.position
        raise ValueError(f"Unexpected character '{char}' at position {position}")

examples/tokenizer_example.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from evaluation_function.parsing import Tokenizer, TokenType
2+
3+
4+
def tokenize_string(text: str):
    """Tokenize *text*, printing each token; return the token list, or None on error."""
    print(f"Input: {text}")
    print("-" * 60)

    tokenizer = Tokenizer(text)
    tokens = []

    while True:
        # Only next_token can raise ValueError, so the try is kept narrow.
        try:
            token = tokenizer.next_token()
        except ValueError as e:
            print(f" ERROR: {e}")
            return None

        tokens.append(token)
        if token.type == TokenType.EOF:
            break
        print(f" {token}")

    print(f"\nTotal tokens: {len(tokens) - 1} (excluding EOF)")
    print()
    return tokens
27+
28+
29+
def main():
    """Run the tokenizer over a fixed set of example formulas and print the results."""
    banner = "=" * 60
    print(banner)
    print("Tokenizer Examples for Propositional Logic")
    print(banner)
    print()

    # Formulas covering atoms (with primes), constants, every connective,
    # parentheses, and a few classic tautologies.
    test_strings = [
        "p",
        "p'",
        "p''",
        "q",
        "r",
        "⊤",
        "⊥",
        "¬p",
        "p ∧ q",
        "p ∨ q",
        "p → q",
        "p ↔ q",
        "p ∧ q ∨ r",
        "p → q → r",
        "¬p ∧ q",
        "p ∧ (q ∨ r)",
        "(p → q) → r",
        "p ↔ (q ↔ r)",
        "¬(p ∧ q)",
        "((p → q) ∧ (q → r)) → (p → r)",
        "¬(p ∨ q) ↔ (¬p ∧ ¬q)",
        "p ∧ q ∧ r",
        "p ∨ q ∨ r",
        "p ∧ q → r ∨ s",
    ]

    for formula in test_strings:
        tokenize_string(formula)

    print(banner)
    print("All examples completed!")
    print(banner)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)