Skip to content

Commit b3451f3

Browse files
committed
implemented tokenizer for parser
1 parent b3b76af commit b3451f3

9 files changed

Lines changed: 248 additions & 0 deletions

File tree

File renamed without changes.
File renamed without changes.
File renamed without changes.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Public API of the tokenizer package.

Re-exports the tokenizer, token classes, character stream, and the
token-matcher strategy classes so callers can import everything from
the package root.
"""

from .tokenizer import Tokenizer
from .token import Token, TokenType
from .character_stream import CharacterStream
from .token_matcher import TokenMatcher, SingleCharTokenMatcher, AtomTokenMatcher, EOFTokenMatcher

# Names exported by `from <package> import *`.
__all__ = [
    "Tokenizer",
    "Token",
    "TokenType",
    "CharacterStream",
    "TokenMatcher",
    "SingleCharTokenMatcher",
    "AtomTokenMatcher",
    "EOFTokenMatcher",
]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from typing import Optional
2+
3+
4+
class CharacterStream:
    """Forward-only cursor over an immutable string of source text."""

    def __init__(self, text: str):
        self._text = text
        self._position = 0

    def _char_at(self, index: int) -> Optional[str]:
        # Shared bounds-checked lookup used by current_char and peek.
        return self._text[index] if index < len(self._text) else None

    @property
    def current_char(self) -> Optional[str]:
        """Character under the cursor, or None once the end is reached."""
        return self._char_at(self._position)

    @property
    def position(self) -> int:
        """Zero-based index of the cursor within the text."""
        return self._position

    def advance(self):
        """Move the cursor one character forward, clamped at end of text."""
        self._position = min(self._position + 1, len(self._text))

    def peek(self, offset: int = 1) -> Optional[str]:
        """Look *offset* characters ahead without moving the cursor."""
        return self._char_at(self._position + offset)

    def is_eof(self) -> bool:
        """True when the whole text has been consumed."""
        return self.current_char is None
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from enum import Enum
2+
3+
4+
class TokenType(Enum):
    """Kinds of lexical tokens in a propositional-logic formula."""

    ATOM = "ATOM"
    TRUTH = "TRUTH"
    FALSITY = "FALSITY"
    NEGATION = "NEGATION"
    CONJUNCTION = "CONJUNCTION"
    DISJUNCTION = "DISJUNCTION"
    IMPLICATION = "IMPLICATION"
    BICONDITIONAL = "BICONDITIONAL"
    LEFT_PAREN = "LEFT_PAREN"
    RIGHT_PAREN = "RIGHT_PAREN"
    EOF = "EOF"


class Token:
    """A lexical token: its type, source text, and start position.

    Equality and hashing deliberately ignore ``position`` so two tokens
    with the same type and text compare equal no matter where they
    occurred in the input.
    """

    def __init__(self, token_type: TokenType, value: str, position: int):
        self.type = token_type
        self.value = value
        self.position = position  # index in the source where the token starts

    def __repr__(self) -> str:
        return f"Token({self.type.name}, '{self.value}', {self.position})"

    def __eq__(self, other) -> bool:
        if not isinstance(other, Token):
            # Return NotImplemented (not False) so Python can try the
            # other operand's __eq__ before deciding the comparison.
            return NotImplemented
        return self.type == other.type and self.value == other.value

    def __hash__(self) -> int:
        # Fix: defining __eq__ alone sets __hash__ to None, which made
        # Token unhashable (unusable in sets / as dict keys).  Hash on
        # exactly the fields equality uses.
        return hash((self.type, self.value))
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Optional
3+
from .character_stream import CharacterStream
4+
from .token import Token, TokenType
5+
6+
7+
class TokenMatcher(ABC):
    """Strategy interface: recognise and consume one token from a stream."""

    @abstractmethod
    def matches(self, stream: CharacterStream) -> bool:
        """Return True if a token of this kind starts at the stream's current position."""

    @abstractmethod
    def create_token(self, stream: CharacterStream) -> Token:
        """Consume the matched characters and build the resulting Token."""
15+
16+
17+
class SingleCharTokenMatcher(TokenMatcher):
    """Matches one fixed character and maps it to a fixed token type."""

    def __init__(self, char: str, token_type: TokenType):
        self._char = char
        self._token_type = token_type

    def matches(self, stream: CharacterStream) -> bool:
        """True when the stream's current character is the configured one."""
        return stream.current_char == self._char

    def create_token(self, stream: CharacterStream) -> Token:
        """Consume the single character and emit its token."""
        start = stream.position
        stream.advance()
        return Token(self._token_type, self._char, start)
29+
30+
31+
class AtomTokenMatcher(TokenMatcher):
    """Matches atom names: runs of alphanumerics plus extra allowed characters.

    The default allowed character is the prime mark, so atoms such as
    p, p' and p'' are recognised.
    """

    def __init__(self, allowed_chars: str = "'"):
        self._allowed_chars = allowed_chars

    def _is_atom_char(self, char: Optional[str]) -> bool:
        # A character belongs to an atom if it is alphanumeric or explicitly allowed.
        return char is not None and (char.isalnum() or char in self._allowed_chars)

    def matches(self, stream: CharacterStream) -> bool:
        """True when an atom starts at the stream's current position."""
        return self._is_atom_char(stream.current_char)

    def create_token(self, stream: CharacterStream) -> Token:
        """Consume the full run of atom characters and emit an ATOM token."""
        start = stream.position
        chars = []
        while self._is_atom_char(stream.current_char):
            chars.append(stream.current_char)
            stream.advance()

        name = "".join(chars)
        if not name:
            # Defensive: only reachable if create_token is called without
            # a prior successful matches().
            raise ValueError(f"Empty atom at position {start}")

        return Token(TokenType.ATOM, name, start)
55+
56+
57+
class EOFTokenMatcher(TokenMatcher):
    """Matches end of input, producing a zero-width EOF token."""

    def matches(self, stream: CharacterStream) -> bool:
        """True once the stream is exhausted."""
        return stream.is_eof()

    def create_token(self, stream: CharacterStream) -> Token:
        # EOF carries no text; its position is the stream's end position.
        return Token(TokenType.EOF, "", stream.position)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from typing import List, Optional
2+
from .character_stream import CharacterStream
3+
from .token_matcher import TokenMatcher, SingleCharTokenMatcher, AtomTokenMatcher, EOFTokenMatcher
4+
from .token import Token, TokenType
5+
6+
7+
class Tokenizer:
    """Turns a propositional-logic formula string into a stream of Tokens.

    Matching is delegated to an ordered list of TokenMatcher strategies;
    the first matcher that accepts the current position produces the token.
    """

    def __init__(self, text: str, matchers: Optional[List[TokenMatcher]] = None):
        self._stream = CharacterStream(text)
        # Fall back to the standard connective/atom/EOF matchers when none given.
        self._matchers: List[TokenMatcher] = matchers or self._create_default_matchers()

    def _create_default_matchers(self) -> List[TokenMatcher]:
        """Build matchers for the standard connectives, atoms, and EOF."""
        connective_chars = [
            ("⊤", TokenType.TRUTH),
            ("⊥", TokenType.FALSITY),
            ("¬", TokenType.NEGATION),
            ("∧", TokenType.CONJUNCTION),
            ("∨", TokenType.DISJUNCTION),
            ("→", TokenType.IMPLICATION),
            ("↔", TokenType.BICONDITIONAL),
            ("(", TokenType.LEFT_PAREN),
            (")", TokenType.RIGHT_PAREN),
        ]
        defaults: List[TokenMatcher] = [
            SingleCharTokenMatcher(char, kind) for char, kind in connective_chars
        ]
        # Atom matching must come after the fixed symbols; EOF comes last.
        defaults.append(AtomTokenMatcher())
        defaults.append(EOFTokenMatcher())
        return defaults

    def _skip_whitespace(self):
        """Advance the stream past any whitespace before the next token."""
        stream = self._stream
        while stream.current_char is not None and stream.current_char.isspace():
            stream.advance()

    def next_token(self) -> Token:
        """Return the next token.

        Raises:
            ValueError: if no matcher recognises the current character.
        """
        self._skip_whitespace()

        winner = next((m for m in self._matchers if m.matches(self._stream)), None)
        if winner is not None:
            return winner.create_token(self._stream)

        char = self._stream.current_char
        position = self._stream.position
        raise ValueError(f"Unexpected character '{char}' at position {position}")

examples/tokenizer_example.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from evaluation_function.parsing import Tokenizer, TokenType
2+
3+
4+
def tokenize_string(text: str):
    """Tokenize *text*, printing each token; return the token list, or None on error."""
    print(f"Input: {text}")
    print("-" * 60)

    tokenizer = Tokenizer(text)
    tokens = []

    while True:
        # Only next_token can raise ValueError, so the try is kept narrow.
        try:
            token = tokenizer.next_token()
        except ValueError as e:
            print(f" ERROR: {e}")
            return None

        tokens.append(token)
        if token.type == TokenType.EOF:
            break
        print(f" {token}")

    print(f"\nTotal tokens: {len(tokens) - 1} (excluding EOF)")
    print()
    return tokens
27+
28+
29+
def main():
    """Run the tokenizer over a fixed set of example formulas and print the results."""
    banner = "=" * 60
    print(banner)
    print("Tokenizer Examples for Propositional Logic")
    print(banner)
    print()

    # Formulas covering atoms (with primes), constants, every connective,
    # parentheses, and a few classic tautologies.
    test_strings = [
        "p",
        "p'",
        "p''",
        "q",
        "r",
        "⊤",
        "⊥",
        "¬p",
        "p ∧ q",
        "p ∨ q",
        "p → q",
        "p ↔ q",
        "p ∧ q ∨ r",
        "p → q → r",
        "¬p ∧ q",
        "p ∧ (q ∨ r)",
        "(p → q) → r",
        "p ↔ (q ↔ r)",
        "¬(p ∧ q)",
        "((p → q) ∧ (q → r)) → (p → r)",
        "¬(p ∨ q) ↔ (¬p ∧ ¬q)",
        "p ∧ q ∧ r",
        "p ∨ q ∨ r",
        "p ∧ q → r ∨ s",
    ]

    for formula in test_strings:
        tokenize_string(formula)

    print(banner)
    print("All examples completed!")
    print(banner)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)