From 665b70b5f396029510f62e5faa6c6a65d350f0e5 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 09:59:46 -0400 Subject: [PATCH 1/9] initial implementation --- .github/workflows/tests.yml | 2 +- pyproject.toml | 1 + tests/test_blob_parsing_fuzz.py | 190 ++++++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 tests/test_blob_parsing_fuzz.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b040985..2d1ae62 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -105,7 +105,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install -r requirements-lock.txt - python -m pip install 'pytest>=8,<9' + python -m pip install 'pytest>=8,<9' 'hypothesis>=6.100,<7' - name: Run unittest suite run: python -m unittest discover tests -v diff --git a/pyproject.toml b/pyproject.toml index 2c4226b..ea79f67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ desktop = ["pywebview>=5.0,<6"] dev = [ "pytest>=8,<9", "mypy>=1.10,<2", + "hypothesis>=6.100,<7", ] [project.scripts] diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py new file mode 100644 index 0000000..8df056c --- /dev/null +++ b/tests/test_blob_parsing_fuzz.py @@ -0,0 +1,190 @@ +"""Property-based fuzz tests for blob / bubble parsing (issue #71). + +Run: + python -m unittest tests.test_blob_parsing_fuzz -v + python -m pytest tests/test_blob_parsing_fuzz.py -v +""" + +from __future__ import annotations + +import json +import os +import sys +import unittest + +from hypothesis import given, settings +from hypothesis import strategies as st + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from models import Bubble, SchemaError +from utils.cli_chat_reader import _extract_blob_refs, messages_to_bubbles +from utils.text_extract import extract_text_from_bubble + +# Bounded strategies: fast enough for CI (<30s total with default example counts). +_JSON_VALUES = st.one_of( + st.none(), + st.booleans(), + st.integers(), + st.floats(allow_nan=False, allow_infinity=False), + st.text(max_size=200), + st.lists(st.text(max_size=80), max_size=8), +) + +_BUBBLE_RAW = st.dictionaries( + st.text(min_size=0, max_size=40), + _JSON_VALUES, + max_size=12, +) + +_BUBBLE_ID = st.text( + alphabet=st.characters(blacklist_categories=("Cs",), blacklist_characters="\x00"), + min_size=1, + max_size=80, +) + +@st.composite +def _cli_message(draw) -> dict: + role = draw(st.sampled_from(["user", "assistant", "system", "tool", ""])) + content = draw( + st.one_of( + st.text(max_size=500), + st.lists( + st.dictionaries( + st.sampled_from( + ["type", "text", "toolName", "args", "toolCallId", "result"] + ), + st.one_of(st.text(max_size=120), st.integers(), st.none()), + max_size=6, + ), + max_size=8, + ), + st.none(), + ) + ) + return {"role": role, "content": content} + +_BUBBLE_LIKE = st.dictionaries( + st.sampled_from(["text", "richText", "codeBlocks", "type", "metadata"]), + st.one_of( + st.text(max_size=300), + st.none(), + st.lists( + st.dictionaries( + st.text(max_size=20), + st.one_of(st.text(max_size=100), st.integers()), + max_size=5, + ), + max_size=4, + ), + st.dictionaries(st.text(max_size=20), _JSON_VALUES, max_size=5), + ), + max_size=6, +) + + +def _classify_blob_bytes(data: bytes) -> None: + """Mirror traverse_blobs blob classification without SQLite.""" + try: + msg = json.loads(data.decode("utf-8")) + if isinstance(msg, dict) and "role" in msg: + return + except (UnicodeDecodeError, json.JSONDecodeError, TypeError): + pass + _extract_blob_refs(data) + + +class TestBubbleFromDictFuzz(unittest.TestCase): + @given(raw=_BUBBLE_RAW, bubble_id=_BUBBLE_ID) + @settings(max_examples=80, deadline=None) + def test_never_raises_unhandled(self, raw: dict, bubble_id: str) -> None: + try: + bubble = Bubble.from_dict(raw, bubble_id=bubble_id) + except SchemaError: + return + except Exception as exc: + self.fail(f"unexpected {type(exc).__name__}: {exc}") + self.assertEqual(bubble.bubble_id, bubble_id) + self.assertIs(bubble.raw, raw) + + @given(raw=_BUBBLE_RAW, bubble_id=_BUBBLE_ID) + @settings(max_examples=80, deadline=None) + def test_parsing_is_idempotent(self, raw: dict, bubble_id: str) -> None: + try: + first = Bubble.from_dict(raw, bubble_id=bubble_id) + second = Bubble.from_dict(raw, bubble_id=bubble_id) + except SchemaError: + return + except Exception as exc: + self.fail(f"unexpected {type(exc).__name__}: {exc}") + self.assertEqual(first, second) + + +class TestBlobChainParsingFuzz(unittest.TestCase): + @given(data=st.binary(max_size=4096)) + @settings(max_examples=120, deadline=None) + def test_extract_blob_refs_never_raises(self, data: bytes) -> None: + try: + refs = _extract_blob_refs(data) + except Exception as exc: + self.fail(f"unexpected {type(exc).__name__}: {exc}") + self.assertIsInstance(refs, list) + for ref in refs: + self.assertIsInstance(ref, str) + self.assertEqual(len(ref), 64) + + @given(data=st.binary(max_size=4096)) + @settings(max_examples=80, deadline=None) + def test_extract_blob_refs_is_idempotent(self, data: bytes) -> None: + self.assertEqual(_extract_blob_refs(data), _extract_blob_refs(data)) + + @given(data=st.binary(max_size=4096)) + @settings(max_examples=80, deadline=None) + def test_blob_classification_never_raises(self, data: bytes) -> None: + try: + _classify_blob_bytes(data) + except Exception as exc: + self.fail(f"unexpected {type(exc).__name__}: {exc}") + + +class TestTextExtractionFuzz(unittest.TestCase): + @given(bubble=_BUBBLE_LIKE) + @settings(max_examples=100, deadline=None) + def test_extract_text_from_bubble_never_raises(self, bubble: dict) -> None: + try: + extract_text_from_bubble(bubble) + except Exception as exc: + self.fail(f"unexpected {type(exc).__name__}: {exc}") + + @given(bubble=_BUBBLE_LIKE) + @settings(max_examples=80, deadline=None) + def test_extract_text_is_idempotent(self, bubble: dict) -> None: + self.assertEqual( + extract_text_from_bubble(bubble), + extract_text_from_bubble(bubble), + ) + + @given( + messages=st.lists(_cli_message(), max_size=12), + created_at=st.integers(min_value=0, max_value=2_000_000_000_000), + ) + @settings(max_examples=80, deadline=None) + def test_messages_to_bubbles_then_extract_never_raises( + self, messages: list[dict], created_at: int + ) -> None: + try: + bubbles = messages_to_bubbles(messages, created_at) + except Exception as exc: + self.fail(f"messages_to_bubbles raised {type(exc).__name__}: {exc}") + self.assertIsInstance(bubbles, list) + for bubble in bubbles: + try: + extract_text_from_bubble(bubble) + except Exception as exc: + self.fail(f"extract_text_from_bubble raised {type(exc).__name__}: {exc}") + + +if __name__ == "__main__": + unittest.main() From a4ab1aff08544c685b194b63aff65e01afd440db Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 10:14:32 -0400 Subject: [PATCH 2/9] improved for issues --- .github/workflows/tests.yml | 1 + .gitignore | 1 + README.md | 6 ++ tests/test_blob_parsing_fuzz.py | 180 ++++++++++++++++++++++++++++---- utils/cli_chat_reader.py | 31 ++++-- utils/text_extract.py | 6 +- 6 files changed, 190 insertions(+), 35 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2d1ae62..2342a44 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -105,6 +105,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install -r requirements-lock.txt + # Dev-only (not in requirements-lock.txt): pytest + hypothesis for unittest/property tests. python -m pip install 'pytest>=8,<9' 'hypothesis>=6.100,<7' - name: Run unittest suite diff --git a/.gitignore b/.gitignore index 685a7ae..5fd078f 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,4 @@ Thumbs.db .coverage htmlcov/ coverage.xml +.hypothesis/ diff --git a/README.md b/README.md index 4ca8e78..007802e 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,12 @@ source venv/bin/activate pip install -r requirements.txt ``` +For development (pytest, mypy, Hypothesis property tests): + +```bash +pip install -e ".[dev]" +``` + For reproducible installs (same versions as CI), use the pinned lock file: ```bash diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index 8df056c..ba4af09 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -9,7 +9,9 @@ import json import os +import sqlite3 import sys +import tempfile import unittest from hypothesis import given, settings @@ -20,7 +22,12 @@ sys.path.insert(0, REPO_ROOT) from models import Bubble, SchemaError -from utils.cli_chat_reader import _extract_blob_refs, messages_to_bubbles +from utils.cli_chat_reader import ( + classify_blob_data, + messages_to_bubbles, + traverse_blobs, + _extract_blob_refs, +) from utils.text_extract import extract_text_from_bubble # Bounded strategies: fast enough for CI (<30s total with default example counts). @@ -39,14 +46,38 @@ max_size=12, ) +_BUBBLE_RAW_ANY = st.one_of( + _BUBBLE_RAW, + st.none(), + st.integers(), + st.lists(st.text(max_size=40), max_size=5), + st.text(max_size=200), +) + _BUBBLE_ID = st.text( alphabet=st.characters(blacklist_categories=("Cs",), blacklist_characters="\x00"), min_size=1, max_size=80, ) +_BUBBLE_ID_ANY = st.one_of( + _BUBBLE_ID, + st.just(""), + st.none(), + st.integers(min_value=0, max_value=9999), + st.binary(min_size=0, max_size=8), +) + +_BLOB_ID_HEX = st.text( + alphabet="abcdef0123456789", + min_size=64, + max_size=64, +) + + @st.composite def _cli_message(draw) -> dict: + # Empty role is intentional adversarial input (unknown / missing role). role = draw(st.sampled_from(["user", "assistant", "system", "tool", ""])) content = draw( st.one_of( @@ -66,6 +97,7 @@ def _cli_message(draw) -> dict: ) return {"role": role, "content": content} + _BUBBLE_LIKE = st.dictionaries( st.sampled_from(["text", "richText", "codeBlocks", "type", "metadata"]), st.one_of( @@ -84,42 +116,101 @@ def _cli_message(draw) -> dict: max_size=6, ) +_KV_VALUE = st.one_of( + st.none(), + _BUBBLE_RAW, + st.text(max_size=400), + st.binary(max_size=256), + st.integers(), +) + + +def _make_meta_value(meta: dict) -> str: + return json.dumps(meta).encode("utf-8").hex() + -def _classify_blob_bytes(data: bytes) -> None: - """Mirror traverse_blobs blob classification without SQLite.""" +def _build_store_db_raw(path: str, meta: dict, blobs: dict[str, bytes]) -> None: + """Minimal store.db with arbitrary blob payloads (for traverse_blobs fuzz).""" + conn = sqlite3.connect(path) + conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)") + conn.execute("CREATE TABLE blobs (id TEXT PRIMARY KEY, data BLOB)") + conn.execute("INSERT INTO meta VALUES ('0', ?)", (_make_meta_value(meta),)) + for blob_id, data in blobs.items(): + conn.execute("INSERT INTO blobs VALUES (?, ?)", (blob_id, data)) + conn.commit() + conn.close() + + +def _assemble_workspace_bubble(bubble_id: object, value: object) -> dict | None: + """Mirror workspace_tabs KV bubble load (json.loads → Bubble.from_dict).""" try: - msg = json.loads(data.decode("utf-8")) - if isinstance(msg, dict) and "role" in msg: - return - except (UnicodeDecodeError, json.JSONDecodeError, TypeError): - pass - _extract_blob_refs(data) + if value is None: + return None + if isinstance(value, (bytes, bytearray)): + parsed = json.loads(bytes(value).decode("utf-8")) + elif isinstance(value, str): + parsed = json.loads(value) + else: + parsed = value + except (json.JSONDecodeError, TypeError, ValueError, UnicodeDecodeError): + return None + try: + if not isinstance(bubble_id, str): + Bubble.from_dict(parsed, bubble_id=bubble_id) # type: ignore[arg-type] + return None + return Bubble.from_dict(parsed, bubble_id=bubble_id).raw + except SchemaError: + return None + + +def _parse_bubble_from_dict(raw: object, bubble_id: object) -> Bubble | None: + """Call Bubble.from_dict; return None on SchemaError, propagate nothing else.""" + try: + return Bubble.from_dict(raw, bubble_id=bubble_id) # type: ignore[arg-type] + except SchemaError: + return None class TestBubbleFromDictFuzz(unittest.TestCase): @given(raw=_BUBBLE_RAW, bubble_id=_BUBBLE_ID) @settings(max_examples=80, deadline=None) def test_never_raises_unhandled(self, raw: dict, bubble_id: str) -> None: - try: - bubble = Bubble.from_dict(raw, bubble_id=bubble_id) - except SchemaError: + bubble = _parse_bubble_from_dict(raw, bubble_id) + if bubble is None: return - except Exception as exc: - self.fail(f"unexpected {type(exc).__name__}: {exc}") self.assertEqual(bubble.bubble_id, bubble_id) self.assertIs(bubble.raw, raw) + @given(raw=_BUBBLE_RAW_ANY, bubble_id=_BUBBLE_ID_ANY) + @settings(max_examples=80, deadline=None) + def test_adversarial_inputs_only_schema_error_or_success( + self, raw: object, bubble_id: object + ) -> None: + try: + _parse_bubble_from_dict(raw, bubble_id) + except Exception as exc: + self.fail(f"unexpected {type(exc).__name__}: {exc}") + @given(raw=_BUBBLE_RAW, bubble_id=_BUBBLE_ID) @settings(max_examples=80, deadline=None) def test_parsing_is_idempotent(self, raw: dict, bubble_id: str) -> None: + first = _parse_bubble_from_dict(raw, bubble_id) + second = _parse_bubble_from_dict(raw, bubble_id) + self.assertEqual(first, second) + + +class TestWorkspaceTabsAssemblyFuzz(unittest.TestCase): + @given(bubble_id=_BUBBLE_ID_ANY, value=_KV_VALUE) + @settings(max_examples=100, deadline=None) + def test_assemble_workspace_bubble_never_raises( + self, bubble_id: object, value: object + ) -> None: try: - first = Bubble.from_dict(raw, bubble_id=bubble_id) - second = Bubble.from_dict(raw, bubble_id=bubble_id) - except SchemaError: - return + result = _assemble_workspace_bubble(bubble_id, value) except Exception as exc: self.fail(f"unexpected {type(exc).__name__}: {exc}") - self.assertEqual(first, second) + if result is not None: + self.assertIsInstance(result, dict) class TestBlobChainParsingFuzz(unittest.TestCase): @@ -142,11 +233,39 @@ def test_extract_blob_refs_is_idempotent(self, data: bytes) -> None: @given(data=st.binary(max_size=4096)) @settings(max_examples=80, deadline=None) - def test_blob_classification_never_raises(self, data: bytes) -> None: + def test_classify_blob_data_never_raises(self, data: bytes) -> None: try: - _classify_blob_bytes(data) + msg, refs = classify_blob_data(data) except Exception as exc: self.fail(f"unexpected {type(exc).__name__}: {exc}") + if msg is not None: + self.assertIsInstance(msg, dict) + self.assertEqual(refs, []) + else: + self.assertIsInstance(refs, list) + + @given( + root_id=_BLOB_ID_HEX, + extra_ids=st.lists(_BLOB_ID_HEX, max_size=6, unique=True), + payloads=st.lists(st.binary(max_size=1024), min_size=1, max_size=8), + ) + @settings(max_examples=40, deadline=None) + def test_traverse_blobs_never_raises( + self, root_id: str, extra_ids: list[str], payloads: list[bytes] + ) -> None: + meta = {"latestRootBlobId": root_id, "createdAt": 1_700_000_000_000} + blobs: dict[str, bytes] = {root_id: payloads[0]} + for i, bid in enumerate(extra_ids): + if bid not in blobs: + blobs[bid] = payloads[(i + 1) % len(payloads)] + with tempfile.TemporaryDirectory() as td: + db_path = os.path.join(td, "store.db") + _build_store_db_raw(db_path, meta, blobs) + try: + messages = traverse_blobs(db_path) + except Exception as exc: + self.fail(f"traverse_blobs raised {type(exc).__name__}: {exc}") + self.assertIsInstance(messages, list) class TestTextExtractionFuzz(unittest.TestCase): @@ -154,9 +273,10 @@ class TestTextExtractionFuzz(unittest.TestCase): @settings(max_examples=100, deadline=None) def test_extract_text_from_bubble_never_raises(self, bubble: dict) -> None: try: - extract_text_from_bubble(bubble) + text = extract_text_from_bubble(bubble) except Exception as exc: self.fail(f"unexpected {type(exc).__name__}: {exc}") + self.assertIsInstance(text, str) @given(bubble=_BUBBLE_LIKE) @settings(max_examples=80, deadline=None) @@ -181,9 +301,23 @@ def test_messages_to_bubbles_then_extract_never_raises( self.assertIsInstance(bubbles, list) for bubble in bubbles: try: - extract_text_from_bubble(bubble) + text = extract_text_from_bubble(bubble) except Exception as exc: self.fail(f"extract_text_from_bubble raised {type(exc).__name__}: {exc}") + self.assertIsInstance(text, str) + + @given( + messages=st.lists(_cli_message(), max_size=12), + created_at=st.integers(min_value=0, max_value=2_000_000_000_000), + ) + @settings(max_examples=80, deadline=None) + def test_messages_to_bubbles_is_idempotent( + self, messages: list[dict], created_at: int + ) -> None: + self.assertEqual( + messages_to_bubbles(messages, created_at), + messages_to_bubbles(messages, created_at), + ) if __name__ == "__main__": diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py index 14dbd0c..f0f1a4f 100644 --- a/utils/cli_chat_reader.py +++ b/utils/cli_chat_reader.py @@ -79,6 +79,23 @@ def _extract_blob_refs(data: bytes) -> list[str]: return refs +def classify_blob_data(data: bytes) -> tuple[dict | None, list[str]]: + """Classify a blob payload as a JSON message or a binary chain node. + + Returns ``(message_dict, [])`` when *data* decodes to a dict with a + ``role`` field; otherwise ``(None, refs)`` where *refs* are SHA-256 hex + ids from :func:`_extract_blob_refs`. Used by :func:`traverse_blobs` and + property tests — keep in sync when the load loop changes. + """ + try: + msg = json.loads(data.decode("utf-8")) + if isinstance(msg, dict) and "role" in msg: + return msg, [] + except (UnicodeDecodeError, json.JSONDecodeError, TypeError): + pass + return None, _extract_blob_refs(data) + + def traverse_blobs(db_path: str) -> list[dict]: """Reconstruct the conversation from a ``store.db`` blob graph. @@ -118,15 +135,11 @@ def traverse_blobs(db_path: str) -> list[dict]: for blob_id, data in conn.execute("SELECT id, data FROM blobs"): if not isinstance(data, bytes): continue - try: - msg = json.loads(data.decode("utf-8")) - if isinstance(msg, dict) and "role" in msg: - json_blobs[blob_id] = msg - continue - except (UnicodeDecodeError, json.JSONDecodeError): - pass - refs = _extract_blob_refs(data) - chain_blobs[blob_id] = refs + msg, refs = classify_blob_data(data) + if msg is not None: + json_blobs[blob_id] = msg + else: + chain_blobs[blob_id] = refs # BFS from root (newest-first by nature of the linked-list structure); # reverse at the end to restore chronological (oldest→newest) order. diff --git a/utils/text_extract.py b/utils/text_extract.py index d0b179c..644ec10 100644 --- a/utils/text_extract.py +++ b/utils/text_extract.py @@ -28,9 +28,9 @@ def extract_text_from_bubble(bubble: dict) -> str: text = "" - # Try text field first + # Try text field first (coerce non-str values — Cursor payloads can drift) if bubble.get("text") and str(bubble["text"]).strip(): - text = bubble["text"] + text = str(bubble["text"]) # Fall back to richText if not text and bubble.get("richText"): @@ -49,7 +49,7 @@ def extract_text_from_bubble(bubble: dict) -> str: lang = cb.get("language", "") text += f"\n\n```{lang}\n{cb['content']}\n```" - return text + return text if isinstance(text, str) else "" def slug(s: str) -> str: From 8d0f2976f8471864c9ee61de560f77264ada39aa Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 10:41:20 -0400 Subject: [PATCH 3/9] =?UTF-8?q?fix:=20Remove=20dead=20return=20None=20in?= =?UTF-8?q?=20=5Fassemble=5Fworkspace=5Fbubble=20when=20bubble=5Fid=20isn?= =?UTF-8?q?=E2=80=99t=20a=20str.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_blob_parsing_fuzz.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index ba4af09..6852d15 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -155,10 +155,7 @@ def _assemble_workspace_bubble(bubble_id: object, value: object) -> dict | None: except (json.JSONDecodeError, TypeError, ValueError, UnicodeDecodeError): return None try: - if not isinstance(bubble_id, str): - Bubble.from_dict(parsed, bubble_id=bubble_id) # type: ignore[arg-type] - return None - return Bubble.from_dict(parsed, bubble_id=bubble_id).raw + return Bubble.from_dict(parsed, bubble_id=bubble_id).raw # type: ignore[arg-type] except SchemaError: return None From 24a144bc90f98d75f6afeed86a7dbfc03ec2cd2d Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 10:52:59 -0400 Subject: [PATCH 4/9] fix: resolve findings from coderabbitai's review report --- tests/test_blob_parsing_fuzz.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index 6852d15..1ab780c 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -14,7 +14,7 @@ import tempfile import unittest -from hypothesis import given, settings +from hypothesis import HealthCheck, given, settings from hypothesis import strategies as st REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -26,7 +26,7 @@ classify_blob_data, messages_to_bubbles, traverse_blobs, - _extract_blob_refs, + _extract_blob_refs, # internal helper; covered directly alongside classify_blob_data ) from utils.text_extract import extract_text_from_bubble @@ -76,7 +76,7 @@ @st.composite -def _cli_message(draw) -> dict: +def _cli_message(draw): # Empty role is intentional adversarial input (unknown / missing role). role = draw(st.sampled_from(["user", "assistant", "system", "tool", ""])) content = draw( @@ -142,7 +142,12 @@ def _build_store_db_raw(path: str, meta: dict, blobs: dict[str, bytes]) -> None: def _assemble_workspace_bubble(bubble_id: object, value: object) -> dict | None: - """Mirror workspace_tabs KV bubble load (json.loads → Bubble.from_dict).""" + """Mirror workspace_tabs KV bubble load (json.loads → Bubble.from_dict). + + Intentionally re-implements the conversion instead of importing + ``_loads_kv_value_logged`` (logging / payload hashing side effects). + Keep in sync with the bubbleId load loop in ``services/workspace_tabs.py``. + """ try: if value is None: return None @@ -246,10 +251,15 @@ def test_classify_blob_data_never_raises(self, data: bytes) -> None: extra_ids=st.lists(_BLOB_ID_HEX, max_size=6, unique=True), payloads=st.lists(st.binary(max_size=1024), min_size=1, max_size=8), ) - @settings(max_examples=40, deadline=None) + @settings( + max_examples=40, + deadline=None, + suppress_health_check=[HealthCheck.too_slow], + ) def test_traverse_blobs_never_raises( self, root_id: str, extra_ids: list[str], payloads: list[bytes] ) -> None: + # CliSessionMeta only requires latestRootBlobId (str); BFS runs after meta parse. meta = {"latestRootBlobId": root_id, "createdAt": 1_700_000_000_000} blobs: dict[str, bytes] = {root_id: payloads[0]} for i, bid in enumerate(extra_ids): From b2352c8150b6ff04d022a2b71e4143ecafdfc112 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 12:22:34 -0400 Subject: [PATCH 5/9] fix: _assemble_workspace_bubble doesn't truly mirror production. --- tests/test_blob_parsing_fuzz.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index 1ab780c..87261f3 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -144,20 +144,17 @@ def _build_store_db_raw(path: str, meta: dict, blobs: dict[str, bytes]) -> None: def _assemble_workspace_bubble(bubble_id: object, value: object) -> dict | None: """Mirror workspace_tabs KV bubble load (json.loads → Bubble.from_dict). - Intentionally re-implements the conversion instead of importing - ``_loads_kv_value_logged`` (logging / payload hashing side effects). - Keep in sync with the bubbleId load loop in ``services/workspace_tabs.py``. + Matches ``services/workspace_tabs.py`` (bubbleId loop): ``json.loads(row["value"])`` + with no type branching — same exceptions as production. Rows with ``value IS NULL`` + are not selected in production; ``None`` here returns ``None`` for fuzz only. + + Intentionally omits ``_loads_kv_value_logged`` (logging / payload hashing). """ + if value is None: + return None try: - if value is None: - return None - if isinstance(value, (bytes, bytearray)): - parsed = json.loads(bytes(value).decode("utf-8")) - elif isinstance(value, str): - parsed = json.loads(value) - else: - parsed = value - except (json.JSONDecodeError, TypeError, ValueError, UnicodeDecodeError): + parsed = json.loads(value) # type: ignore[arg-type] + except (json.JSONDecodeError, TypeError, ValueError): return None try: return Bubble.from_dict(parsed, bubble_id=bubble_id).raw # type: ignore[arg-type] From 49734fe39b0a26ce68ee7b2a58bfdb4da839766b Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 12:26:27 -0400 Subject: [PATCH 6/9] fix: classify_blob_data widens the except list unnecessarily --- utils/cli_chat_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py index f0f1a4f..1864af7 100644 --- a/utils/cli_chat_reader.py +++ b/utils/cli_chat_reader.py @@ -91,7 +91,7 @@ def classify_blob_data(data: bytes) -> tuple[dict | None, list[str]]: msg = json.loads(data.decode("utf-8")) if isinstance(msg, dict) and "role" in msg: return msg, [] - except (UnicodeDecodeError, json.JSONDecodeError, TypeError): + except (UnicodeDecodeError, json.JSONDecodeError): pass return None, _extract_blob_refs(data) From 62902133f247dcad7b7ac95055816f2239b76325 Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 12:31:24 -0400 Subject: [PATCH 7/9] fix: _BUBBLE_LIKE doesn't directly cover the bug text_extract.py fixes --- tests/test_blob_parsing_fuzz.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index 87261f3..d148d6b 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -102,6 +102,8 @@ def _cli_message(draw): st.sampled_from(["text", "richText", "codeBlocks", "type", "metadata"]), st.one_of( st.text(max_size=300), + st.integers(), + st.booleans(), st.none(), st.lists( st.dictionaries( From 0289d3e7252e23fb46cb0a022260e45b0c4cdc8f Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 12:39:12 -0400 Subject: [PATCH 8/9] fix: test_traverse_blobs_never_raises skips the meta-parse error path --- tests/test_blob_parsing_fuzz.py | 49 +++++++++++++++++++++++++++++---- tests/test_cli_chat_reader.py | 14 +++++----- tests/test_models.py | 8 +++--- utils/cli_chat_reader.py | 6 ++-- 4 files changed, 58 insertions(+), 19 deletions(-) diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py index d148d6b..36fd753 100644 --- a/tests/test_blob_parsing_fuzz.py +++ b/tests/test_blob_parsing_fuzz.py @@ -24,9 +24,9 @@ from models import Bubble, SchemaError from utils.cli_chat_reader import ( classify_blob_data, + extract_blob_refs, messages_to_bubbles, traverse_blobs, - _extract_blob_refs, # internal helper; covered directly alongside classify_blob_data ) from utils.text_extract import extract_text_from_bubble @@ -132,17 +132,39 @@ def _make_meta_value(meta: dict) -> str: def _build_store_db_raw(path: str, meta: dict, blobs: dict[str, bytes]) -> None: - """Minimal store.db with arbitrary blob payloads (for traverse_blobs fuzz).""" + """Minimal store.db with well-formed meta dict and arbitrary blob payloads.""" + _build_store_db_meta_row(path, _make_meta_value(meta), blobs) + + +def _build_store_db_meta_row( + path: str, meta_row: str | None, blobs: dict[str, bytes] +) -> None: + """Minimal store.db; *meta_row* is the raw ``meta.value`` (hex JSON or adversarial).""" conn = sqlite3.connect(path) conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)") conn.execute("CREATE TABLE blobs (id TEXT PRIMARY KEY, data BLOB)") - conn.execute("INSERT INTO meta VALUES ('0', ?)", (_make_meta_value(meta),)) + if meta_row is not None: + conn.execute("INSERT INTO meta VALUES ('0', ?)", (meta_row,)) for blob_id, data in blobs.items(): conn.execute("INSERT INTO blobs VALUES (?, ?)", (blob_id, data)) conn.commit() conn.close() +_FUZZ_META_ROW = st.one_of( + st.none(), + st.just(""), + st.text(min_size=0, max_size=200), + st.dictionaries(st.text(max_size=20), _JSON_VALUES, max_size=6).map( + lambda d: json.dumps(d).encode("utf-8").hex() + ), + st.builds( + lambda root: _make_meta_value({"latestRootBlobId": root, "createdAt": 1}), + _BLOB_ID_HEX, + ), +) + + def _assemble_workspace_bubble(bubble_id: object, value: object) -> dict | None: """Mirror workspace_tabs KV bubble load (json.loads → Bubble.from_dict). @@ -219,7 +241,7 @@ class TestBlobChainParsingFuzz(unittest.TestCase): @settings(max_examples=120, deadline=None) def test_extract_blob_refs_never_raises(self, data: bytes) -> None: try: - refs = _extract_blob_refs(data) + refs = extract_blob_refs(data) except Exception as exc: self.fail(f"unexpected {type(exc).__name__}: {exc}") self.assertIsInstance(refs, list) @@ -230,7 +252,7 @@ def test_extract_blob_refs_never_raises(self, data: bytes) -> None: @given(data=st.binary(max_size=4096)) @settings(max_examples=80, deadline=None) def test_extract_blob_refs_is_idempotent(self, data: bytes) -> None: - self.assertEqual(_extract_blob_refs(data), _extract_blob_refs(data)) + self.assertEqual(extract_blob_refs(data), extract_blob_refs(data)) @given(data=st.binary(max_size=4096)) @settings(max_examples=80, deadline=None) @@ -273,6 +295,23 @@ def test_traverse_blobs_never_raises( self.fail(f"traverse_blobs raised {type(exc).__name__}: {exc}") self.assertIsInstance(messages, list) + @given(meta_row=_FUZZ_META_ROW) + @settings( + max_examples=30, + deadline=None, + suppress_health_check=[HealthCheck.too_slow], + ) + def test_traverse_blobs_meta_parse_never_raises(self, meta_row: str | None) -> None: + """Covers meta decode / CliSessionMeta.from_dict failure → return [] (no crash).""" + with tempfile.TemporaryDirectory() as td: + db_path = os.path.join(td, "store.db") + _build_store_db_meta_row(db_path, meta_row, {}) + try: + messages = traverse_blobs(db_path) + except Exception as exc: + self.fail(f"traverse_blobs raised {type(exc).__name__}: {exc}") + self.assertIsInstance(messages, list) + class TestTextExtractionFuzz(unittest.TestCase): @given(bubble=_BUBBLE_LIKE) diff --git a/tests/test_cli_chat_reader.py b/tests/test_cli_chat_reader.py index afc182c..ce07d42 100644 --- a/tests/test_cli_chat_reader.py +++ b/tests/test_cli_chat_reader.py @@ -20,7 +20,7 @@ from utils.cli_chat_reader import ( _content_to_text, - _extract_blob_refs, + extract_blob_refs, _extract_tool_calls, _strip_user_info, aggregate_session_stats, @@ -75,34 +75,34 @@ def _build_store_db(path: str, meta: dict, json_blobs: dict[str, dict], chain: d # --------------------------------------------------------------------------- -# _extract_blob_refs +# extract_blob_refs # --------------------------------------------------------------------------- class TestExtractBlobRefs(unittest.TestCase): def test_empty_bytes_returns_empty(self): - self.assertEqual(_extract_blob_refs(b""), []) + self.assertEqual(extract_blob_refs(b""), []) def test_single_ref(self): ref = "a" * 64 # 32 bytes as hex raw = b"\x0a\x20" + bytes.fromhex(ref) - self.assertEqual(_extract_blob_refs(raw), [ref]) + self.assertEqual(extract_blob_refs(raw), [ref]) def test_two_refs(self): ref1 = "a" * 64 ref2 = "b" * 64 raw = b"\x0a\x20" + bytes.fromhex(ref1) + b"\x0a\x20" + bytes.fromhex(ref2) - self.assertEqual(_extract_blob_refs(raw), [ref1, ref2]) + self.assertEqual(extract_blob_refs(raw), [ref1, ref2]) def test_noise_bytes_ignored(self): ref = "c" * 64 noise = b"\x00\xff\x01\x02\x03\x04" raw = noise + b"\x0a\x20" + bytes.fromhex(ref) + b"\xde\xad" - self.assertIn(ref, _extract_blob_refs(raw)) + self.assertIn(ref, extract_blob_refs(raw)) def test_partial_tag_at_end_ignored(self): # Only 0x0a without 0x20 immediately following should not produce a ref. raw = b"\x0a" + b"\x00" * 32 - self.assertEqual(_extract_blob_refs(raw), []) + self.assertEqual(extract_blob_refs(raw), []) # --------------------------------------------------------------------------- diff --git a/tests/test_models.py b/tests/test_models.py index a15a68e..04a8b84 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -17,7 +17,7 @@ Workspace, WorkspaceLocalComposer, ) -from utils.cli_chat_reader import _extract_blob_refs +from utils.cli_chat_reader import extract_blob_refs GOOD_COMPOSER_RAW: dict = { @@ -252,7 +252,7 @@ def test_meta_parses_then_blob_chain_extracts_refs(self) -> None: self.assertEqual(meta.latest_root_blob_id, ref1) chain_blob = _make_blob_chain(ref1, ref2, ref3) - refs = _extract_blob_refs(chain_blob) + refs = extract_blob_refs(chain_blob) self.assertEqual(refs, [ref1, ref2, ref3]) def test_blob_chain_skips_non_marker_bytes(self) -> None: @@ -261,10 +261,10 @@ def test_blob_chain_skips_non_marker_bytes(self) -> None: garbage_after = b"\xff\xfe" raw = garbage_before + bytes([0x0A, 0x20]) + bytes.fromhex(ref) + garbage_after - self.assertEqual(_extract_blob_refs(raw), [ref]) + self.assertEqual(extract_blob_refs(raw), [ref]) def test_blob_chain_empty_returns_empty_list(self) -> None: - self.assertEqual(_extract_blob_refs(b""), []) + self.assertEqual(extract_blob_refs(b""), []) if __name__ == "__main__": diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py index 1864af7..5c744c4 100644 --- a/utils/cli_chat_reader.py +++ b/utils/cli_chat_reader.py @@ -62,7 +62,7 @@ def _read_meta(db_path: str) -> dict: return {} -def _extract_blob_refs(data: bytes) -> list[str]: +def extract_blob_refs(data: bytes) -> list[str]: """Extract all 32-byte (SHA-256) blob references from a binary chain node. The encoding is: tag ``0x0a`` (field 1, length-delimited) followed by @@ -84,7 +84,7 @@ def classify_blob_data(data: bytes) -> tuple[dict | None, list[str]]: Returns ``(message_dict, [])`` when *data* decodes to a dict with a ``role`` field; otherwise ``(None, refs)`` where *refs* are SHA-256 hex - ids from :func:`_extract_blob_refs`. Used by :func:`traverse_blobs` and + ids from :func:`extract_blob_refs`. Used by :func:`traverse_blobs` and property tests — keep in sync when the load loop changes. """ try: @@ -93,7 +93,7 @@ def classify_blob_data(data: bytes) -> tuple[dict | None, list[str]]: return msg, [] except (UnicodeDecodeError, json.JSONDecodeError): pass - return None, _extract_blob_refs(data) + return None, extract_blob_refs(data) def traverse_blobs(db_path: str) -> list[dict]: From ca3325922940f383457b10940ba3bfc4618e992c Mon Sep 17 00:00:00 2001 From: bradjin8 Date: Wed, 27 May 2026 12:46:23 -0400 Subject: [PATCH 9/9] fix: comment slightly misleading --- .github/workflows/tests.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2342a44..924c3b7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -98,14 +98,13 @@ jobs: - name: Install runtime + test dependencies # Install from the pinned lock file for deterministic dependency - # resolution (closes #47). pytest is added on top — it is not in - # requirements-lock.txt because it is a dev-only dep. pywebview is + # resolution (closes #47). pytest and hypothesis are added on top — not in + # requirements-lock.txt (dev-only). pywebview is # the desktop-launcher dep and pulls GTK / Qt system libraries on # Linux — intentionally excluded from the CI unittest matrix. run: | python -m pip install --upgrade pip python -m pip install -r requirements-lock.txt - # Dev-only (not in requirements-lock.txt): pytest + hypothesis for unittest/property tests. python -m pip install 'pytest>=8,<9' 'hypothesis>=6.100,<7' - name: Run unittest suite