Improved fuzz tests and blob/text parsing robustness for reported issues

bradjin8 · bradjin8 · commit a4ab1aff0854 · 2026-05-27T10:14:32.000-04:00
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -105,6 +105,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           python -m pip install -r requirements-lock.txt
+          # Dev-only (not in requirements-lock.txt): pytest + hypothesis for unittest/property tests.
           python -m pip install 'pytest>=8,<9' 'hypothesis>=6.100,<7'
 
       - name: Run unittest suite
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,4 @@ Thumbs.db
 .coverage
 htmlcov/
 coverage.xml
+.hypothesis/
diff --git a/README.md b/README.md
@@ -61,6 +61,12 @@ source venv/bin/activate
 pip install -r requirements.txt
 ```
 
+For development (pytest, mypy, Hypothesis property tests):
+
+```bash
+pip install -e ".[dev]"
+```
+
 For reproducible installs (same versions as CI), use the pinned lock file:
 
 ```bash
diff --git a/tests/test_blob_parsing_fuzz.py b/tests/test_blob_parsing_fuzz.py
@@ -9,7 +9,9 @@
 
 import json
 import os
+import sqlite3
 import sys
+import tempfile
 import unittest
 
 from hypothesis import given, settings
@@ -20,7 +22,12 @@
     sys.path.insert(0, REPO_ROOT)
 
 from models import Bubble, SchemaError
-from utils.cli_chat_reader import _extract_blob_refs, messages_to_bubbles
+from utils.cli_chat_reader import (
+    classify_blob_data,
+    messages_to_bubbles,
+    traverse_blobs,
+    _extract_blob_refs,
+)
 from utils.text_extract import extract_text_from_bubble
 
 # Bounded strategies: fast enough for CI (<30s total with default example counts).
@@ -39,14 +46,38 @@
     max_size=12,
 )
 
+_BUBBLE_RAW_ANY = st.one_of(
+    _BUBBLE_RAW,
+    st.none(),
+    st.integers(),
+    st.lists(st.text(max_size=40), max_size=5),
+    st.text(max_size=200),
+)
+
 _BUBBLE_ID = st.text(
     alphabet=st.characters(blacklist_categories=("Cs",), blacklist_characters="\x00"),
     min_size=1,
     max_size=80,
 )
 
+_BUBBLE_ID_ANY = st.one_of(
+    _BUBBLE_ID,
+    st.just(""),
+    st.none(),
+    st.integers(min_value=0, max_value=9999),
+    st.binary(min_size=0, max_size=8),
+)
+
+_BLOB_ID_HEX = st.text(
+    alphabet="abcdef0123456789",
+    min_size=64,
+    max_size=64,
+)
+
+
 @st.composite
 def _cli_message(draw) -> dict:
+    # Empty role is intentional adversarial input (unknown / missing role).
     role = draw(st.sampled_from(["user", "assistant", "system", "tool", ""]))
     content = draw(
         st.one_of(
@@ -66,6 +97,7 @@ def _cli_message(draw) -> dict:
     )
     return {"role": role, "content": content}
 
+
 _BUBBLE_LIKE = st.dictionaries(
     st.sampled_from(["text", "richText", "codeBlocks", "type", "metadata"]),
     st.one_of(
@@ -84,42 +116,101 @@ def _cli_message(draw) -> dict:
     max_size=6,
 )
 
+_KV_VALUE = st.one_of(
+    st.none(),
+    _BUBBLE_RAW,
+    st.text(max_size=400),
+    st.binary(max_size=256),
+    st.integers(),
+)
+
+
+def _make_meta_value(meta: dict) -> str:
+    return json.dumps(meta).encode("utf-8").hex()
+
 
-def _classify_blob_bytes(data: bytes) -> None:
-    """Mirror traverse_blobs blob classification without SQLite."""
+def _build_store_db_raw(path: str, meta: dict, blobs: dict[str, bytes]) -> None:
+    """Minimal store.db with arbitrary blob payloads (for traverse_blobs fuzz)."""
+    conn = sqlite3.connect(path)
+    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)")
+    conn.execute("CREATE TABLE blobs (id TEXT PRIMARY KEY, data BLOB)")
+    conn.execute("INSERT INTO meta VALUES ('0', ?)", (_make_meta_value(meta),))
+    for blob_id, data in blobs.items():
+        conn.execute("INSERT INTO blobs VALUES (?, ?)", (blob_id, data))
+    conn.commit()
+    conn.close()
+
+
+def _assemble_workspace_bubble(bubble_id: object, value: object) -> dict | None:
+    """Mirror workspace_tabs KV bubble load (json.loads → Bubble.from_dict)."""
     try:
-        msg = json.loads(data.decode("utf-8"))
-        if isinstance(msg, dict) and "role" in msg:
-            return
-    except (UnicodeDecodeError, json.JSONDecodeError, TypeError):
-        pass
-    _extract_blob_refs(data)
+        if value is None:
+            return None
+        if isinstance(value, (bytes, bytearray)):
+            parsed = json.loads(bytes(value).decode("utf-8"))
+        elif isinstance(value, str):
+            parsed = json.loads(value)
+        else:
+            parsed = value
+    except (json.JSONDecodeError, TypeError, ValueError, UnicodeDecodeError):
+        return None
+    try:
+        if not isinstance(bubble_id, str):
+            Bubble.from_dict(parsed, bubble_id=bubble_id)  # type: ignore[arg-type]
+            return None
+        return Bubble.from_dict(parsed, bubble_id=bubble_id).raw
+    except SchemaError:
+        return None
+
+
+def _parse_bubble_from_dict(raw: object, bubble_id: object) -> Bubble | None:
+    """Call Bubble.from_dict; return None on SchemaError, propagate anything else."""
+    try:
+        return Bubble.from_dict(raw, bubble_id=bubble_id)  # type: ignore[arg-type]
+    except SchemaError:
+        return None
 
 
 class TestBubbleFromDictFuzz(unittest.TestCase):
     @given(raw=_BUBBLE_RAW, bubble_id=_BUBBLE_ID)
     @settings(max_examples=80, deadline=None)
     def test_never_raises_unhandled(self, raw: dict, bubble_id: str) -> None:
-        try:
-            bubble = Bubble.from_dict(raw, bubble_id=bubble_id)
-        except SchemaError:
+        bubble = _parse_bubble_from_dict(raw, bubble_id)
+        if bubble is None:
             return
-        except Exception as exc:
-            self.fail(f"unexpected {type(exc).__name__}: {exc}")
         self.assertEqual(bubble.bubble_id, bubble_id)
         self.assertIs(bubble.raw, raw)
 
+    @given(raw=_BUBBLE_RAW_ANY, bubble_id=_BUBBLE_ID_ANY)
+    @settings(max_examples=80, deadline=None)
+    def test_adversarial_inputs_only_schema_error_or_success(
+        self, raw: object, bubble_id: object
+    ) -> None:
+        try:
+            _parse_bubble_from_dict(raw, bubble_id)
+        except Exception as exc:
+            self.fail(f"unexpected {type(exc).__name__}: {exc}")
+
     @given(raw=_BUBBLE_RAW, bubble_id=_BUBBLE_ID)
     @settings(max_examples=80, deadline=None)
     def test_parsing_is_idempotent(self, raw: dict, bubble_id: str) -> None:
+        first = _parse_bubble_from_dict(raw, bubble_id)
+        second = _parse_bubble_from_dict(raw, bubble_id)
+        self.assertEqual(first, second)
+
+
+class TestWorkspaceTabsAssemblyFuzz(unittest.TestCase):
+    @given(bubble_id=_BUBBLE_ID_ANY, value=_KV_VALUE)
+    @settings(max_examples=100, deadline=None)
+    def test_assemble_workspace_bubble_never_raises(
+        self, bubble_id: object, value: object
+    ) -> None:
         try:
-            first = Bubble.from_dict(raw, bubble_id=bubble_id)
-            second = Bubble.from_dict(raw, bubble_id=bubble_id)
-        except SchemaError:
-            return
+            result = _assemble_workspace_bubble(bubble_id, value)
         except Exception as exc:
             self.fail(f"unexpected {type(exc).__name__}: {exc}")
-        self.assertEqual(first, second)
+        if result is not None:
+            self.assertIsInstance(result, dict)
 
 
 class TestBlobChainParsingFuzz(unittest.TestCase):
@@ -142,21 +233,50 @@ def test_extract_blob_refs_is_idempotent(self, data: bytes) -> None:
 
     @given(data=st.binary(max_size=4096))
     @settings(max_examples=80, deadline=None)
-    def test_blob_classification_never_raises(self, data: bytes) -> None:
+    def test_classify_blob_data_never_raises(self, data: bytes) -> None:
         try:
-            _classify_blob_bytes(data)
+            msg, refs = classify_blob_data(data)
         except Exception as exc:
             self.fail(f"unexpected {type(exc).__name__}: {exc}")
+        if msg is not None:
+            self.assertIsInstance(msg, dict)
+            self.assertEqual(refs, [])
+        else:
+            self.assertIsInstance(refs, list)
+
+    @given(
+        root_id=_BLOB_ID_HEX,
+        extra_ids=st.lists(_BLOB_ID_HEX, max_size=6, unique=True),
+        payloads=st.lists(st.binary(max_size=1024), min_size=1, max_size=8),
+    )
+    @settings(max_examples=40, deadline=None)
+    def test_traverse_blobs_never_raises(
+        self, root_id: str, extra_ids: list[str], payloads: list[bytes]
+    ) -> None:
+        meta = {"latestRootBlobId": root_id, "createdAt": 1_700_000_000_000}
+        blobs: dict[str, bytes] = {root_id: payloads[0]}
+        for i, bid in enumerate(extra_ids):
+            if bid not in blobs:
+                blobs[bid] = payloads[(i + 1) % len(payloads)]
+        with tempfile.TemporaryDirectory() as td:
+            db_path = os.path.join(td, "store.db")
+            _build_store_db_raw(db_path, meta, blobs)
+            try:
+                messages = traverse_blobs(db_path)
+            except Exception as exc:
+                self.fail(f"traverse_blobs raised {type(exc).__name__}: {exc}")
+            self.assertIsInstance(messages, list)
 
 
 class TestTextExtractionFuzz(unittest.TestCase):
     @given(bubble=_BUBBLE_LIKE)
     @settings(max_examples=100, deadline=None)
     def test_extract_text_from_bubble_never_raises(self, bubble: dict) -> None:
         try:
-            extract_text_from_bubble(bubble)
+            text = extract_text_from_bubble(bubble)
         except Exception as exc:
             self.fail(f"unexpected {type(exc).__name__}: {exc}")
+        self.assertIsInstance(text, str)
 
     @given(bubble=_BUBBLE_LIKE)
     @settings(max_examples=80, deadline=None)
@@ -181,9 +301,23 @@ def test_messages_to_bubbles_then_extract_never_raises(
         self.assertIsInstance(bubbles, list)
         for bubble in bubbles:
             try:
-                extract_text_from_bubble(bubble)
+                text = extract_text_from_bubble(bubble)
             except Exception as exc:
                 self.fail(f"extract_text_from_bubble raised {type(exc).__name__}: {exc}")
+            self.assertIsInstance(text, str)
+
+    @given(
+        messages=st.lists(_cli_message(), max_size=12),
+        created_at=st.integers(min_value=0, max_value=2_000_000_000_000),
+    )
+    @settings(max_examples=80, deadline=None)
+    def test_messages_to_bubbles_is_idempotent(
+        self, messages: list[dict], created_at: int
+    ) -> None:
+        self.assertEqual(
+            messages_to_bubbles(messages, created_at),
+            messages_to_bubbles(messages, created_at),
+        )
 
 
 if __name__ == "__main__":
diff --git a/utils/cli_chat_reader.py b/utils/cli_chat_reader.py
@@ -79,6 +79,23 @@ def _extract_blob_refs(data: bytes) -> list[str]:
     return refs
 
 
+def classify_blob_data(data: bytes) -> tuple[dict | None, list[str]]:
+    """Classify a blob payload as a JSON message or a binary chain node.
+
+    Returns ``(message_dict, [])`` when *data* is UTF-8 JSON for a dict with a
+    ``role`` key; otherwise ``(None, refs)`` where *refs* are SHA-256 hex ids
+    from :func:`_extract_blob_refs`.  Undecodable, malformed, or too deeply
+    nested payloads are not errors — they fall through to the chain-node case.
+    """
+    try:
+        msg = json.loads(data.decode("utf-8"))
+        if isinstance(msg, dict) and "role" in msg:
+            return msg, []
+    except (UnicodeDecodeError, json.JSONDecodeError, TypeError, RecursionError):
+        pass  # best-effort: not a JSON message, so classify as a chain node
+    return None, _extract_blob_refs(data)
+
+
 def traverse_blobs(db_path: str) -> list[dict]:
     """Reconstruct the conversation from a ``store.db`` blob graph.
 
@@ -118,15 +135,11 @@ def traverse_blobs(db_path: str) -> list[dict]:
         for blob_id, data in conn.execute("SELECT id, data FROM blobs"):
             if not isinstance(data, bytes):
                 continue
-            try:
-                msg = json.loads(data.decode("utf-8"))
-                if isinstance(msg, dict) and "role" in msg:
-                    json_blobs[blob_id] = msg
-                    continue
-            except (UnicodeDecodeError, json.JSONDecodeError):
-                pass
-            refs = _extract_blob_refs(data)
-            chain_blobs[blob_id] = refs
+            msg, refs = classify_blob_data(data)
+            if msg is not None:
+                json_blobs[blob_id] = msg
+            else:
+                chain_blobs[blob_id] = refs
 
     # BFS from root (newest-first by nature of the linked-list structure);
     # reverse at the end to restore chronological (oldest→newest) order.
diff --git a/utils/text_extract.py b/utils/text_extract.py
@@ -28,9 +28,9 @@ def extract_text_from_bubble(bubble: dict) -> str:
 
     text = ""
 
-    # Try text field first
+    # Try text field first (coerce non-str values — Cursor payloads can drift)
     if bubble.get("text") and str(bubble["text"]).strip():
-        text = bubble["text"]
+        text = str(bubble["text"])
 
     # Fall back to richText
     if not text and bubble.get("richText"):
@@ -49,7 +49,7 @@ def extract_text_from_bubble(bubble: dict) -> str:
                 lang = cb.get("language", "")
                 text += f"\n\n```{lang}\n{cb['content']}\n```"
 
-    return text
+    return text if isinstance(text, str) else ""
 
 
 def slug(s: str) -> str: