Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/cachekit/serializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def _get_arrow_serializer() -> type:
# This allows passing enable_integrity_checking parameter during instantiation
SERIALIZER_REGISTRY = {
"auto": AutoSerializer, # Python-specific types (NumPy, pandas, datetime optimization)
"pythonic": AutoSerializer, # Alias — preserves Python types (tuples, sets, frozensets, datetime, UUID)
"default": StandardSerializer, # Language-agnostic MessagePack for multi-language caches
"std": StandardSerializer, # Explicit StandardSerializer alias
"arrow": None, # Lazy-loaded: requires pyarrow from [data] extra
Expand Down Expand Up @@ -121,7 +122,7 @@ def get_serializer(name: str, enable_integrity_checking: bool = True) -> Seriali
serializer_class = SERIALIZER_REGISTRY[name]

# Instantiate with integrity checking configuration
if name in ("default", "std", "auto", "arrow", "orjson"):
if name in ("default", "std", "auto", "pythonic", "arrow", "orjson"):
# All core serializers use enable_integrity_checking parameter
serializer = serializer_class(enable_integrity_checking=enable_integrity_checking)
else:
Expand Down
33 changes: 32 additions & 1 deletion src/cachekit/serializers/auto_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- datetime/date/time (ISO-8601)
- UUID (string representation)
- set/frozenset (type-safe roundtrip)
- tuple (recursive type-safe roundtrip)

Uses MessagePack as the default format with graceful degradation for optional dependencies.

Expand Down Expand Up @@ -73,7 +74,7 @@

CUSTOM_CLASS_ERROR_MESSAGE = (
"AutoSerializer does not support custom classes. "
"Supported types: dict, list, str, int, float, bool, None, bytes, "
"Supported types: dict, list, tuple, str, int, float, bool, None, bytes, "
"datetime, date, time, UUID, set, frozenset, NumPy arrays, pandas DataFrames.\n"
"Options:\n"
" 1. Convert to dict manually\n"
Expand Down Expand Up @@ -124,6 +125,26 @@ def _safe_hasattr(obj: Any, attr: str) -> bool:
return False


def _wrap_tuples(obj: Any) -> Any:
    """Replace every tuple in ``obj`` with a type marker prior to msgpack packing.

    MessagePack encodes tuples as plain arrays, exactly like lists, so the
    ``default`` callback never gets a chance to tag them. This pass walks the
    structure up front and turns each tuple (including tuple subclasses, since
    the check is ``isinstance``) into ``{"__tuple__": True, "value": [...]}``,
    the marker that ``_auto_object_hook`` converts back into a tuple.

    Traversal descends into tuples, lists, and dict *values*; dict keys are
    left as they are. Lists and dicts are rebuilt as new containers. Any other
    object is returned untouched so that msgpack (and its ``default``
    callback, ``_auto_default``) can deal with it.

    Args:
        obj: Arbitrary object graph about to be serialized.

    Returns:
        A structure equivalent to ``obj`` with tuples replaced by marker dicts.
    """
    if isinstance(obj, dict):
        # Only the values are rewritten; keys pass through unchanged.
        return {key: _wrap_tuples(item) for key, item in obj.items()}

    if not isinstance(obj, (tuple, list)):
        # Leaf value, or a type that is handled later during packing.
        return obj

    items = [_wrap_tuples(item) for item in obj]
    if isinstance(obj, tuple):
        return {"__tuple__": True, "value": items}
    return items


def _auto_default(obj: Any) -> Any:
"""Custom encoder for types not natively supported by MessagePack.

Expand Down Expand Up @@ -226,6 +247,14 @@ def _auto_object_hook(obj: Any) -> Any:
except (ValueError, TypeError) as e:
raise SerializationError(f"Invalid UUID format in cached data: {value}") from e

if obj.get("__tuple__") is True:
if "value" not in obj:
raise SerializationError("Invalid tuple format: missing 'value' field in cached data")
value_list = obj["value"]
if not isinstance(value_list, list):
raise SerializationError(f"Invalid tuple format: expected list, got {type(value_list).__name__}")
return tuple(value_list)

if obj.get("__set__") is True:
if "value" not in obj:
raise SerializationError("Invalid set format: missing 'value' field in cached data")
Expand Down Expand Up @@ -748,6 +777,8 @@ def _deserialize_series(self, data) -> pd.Series:

def _serialize_msgpack(self, obj: Any) -> bytes:
"""Serialize general object with MessagePack."""
# Pre-process tuples into markers (msgpack natively flattens them to lists)
obj = _wrap_tuples(obj)
msgpack_data = msgpack.packb(obj, **self._msgpack_pack_opts)

if self.enable_integrity_checking:
Expand Down
96 changes: 95 additions & 1 deletion tests/unit/test_auto_serializer_new_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Unit tests for AutoSerializer new type support (UUID, set, frozenset).
"""Unit tests for AutoSerializer new type support (UUID, set, frozenset, tuple).

Tests:
- UUID serialization roundtrip
Expand Down Expand Up @@ -559,3 +559,97 @@ def test_uuid_deterministic_serialization_property(self, uuid_list: list[UUID]):
result1 = serializer.deserialize(bytes1)
result2 = serializer.deserialize(bytes2)
assert result1 == result2


class TestAutoSerializerTuple:
    """Verify that tuples keep their type across an AutoSerializer roundtrip."""

    @staticmethod
    def _roundtrip(payload):
        """Serialize ``payload`` with a fresh AutoSerializer and decode it again."""
        codec = AutoSerializer()
        blob, meta = codec.serialize(payload)
        return codec.deserialize(blob, meta)

    def test_simple_tuple_roundtrip(self):
        """A flat tuple comes back as an equal tuple."""
        restored = self._roundtrip((1, 2, 3))
        assert isinstance(restored, tuple)
        assert restored == (1, 2, 3)

    def test_nested_tuple_roundtrip(self):
        """Tuples nested at several depths are all restored as tuples."""
        restored = self._roundtrip((1, (2, 3), (4, (5, 6))))
        assert isinstance(restored, tuple)
        assert isinstance(restored[1], tuple)
        assert isinstance(restored[2][1], tuple)
        assert restored == (1, (2, 3), (4, (5, 6)))

    def test_empty_tuple_roundtrip(self):
        """The empty tuple is preserved."""
        restored = self._roundtrip(())
        assert isinstance(restored, tuple)
        assert restored == ()

    def test_tuple_in_dict(self):
        """A tuple stored as a dict value is restored as a tuple."""
        restored = self._roundtrip({"key": (1, 2), "other": "value"})
        assert isinstance(restored["key"], tuple)
        assert restored["key"] == (1, 2)

    def test_tuple_in_list(self):
        """Tuples that are list elements are restored as tuples."""
        restored = self._roundtrip([(1, 2), (3, 4)])
        assert isinstance(restored[0], tuple)
        assert isinstance(restored[1], tuple)

    def test_tuple_with_set_and_datetime(self):
        """Tuple containing other special types that use _auto_default."""
        from datetime import datetime

        restored = self._roundtrip((1, {2, 3}, datetime(2025, 1, 1)))
        assert isinstance(restored, tuple)
        assert isinstance(restored[1], set)
        assert isinstance(restored[2], datetime)

    def test_list_preserved_as_list_not_tuple(self):
        """Lists must stay as lists — only tuples get markers."""
        restored = self._roundtrip([1, 2, 3])
        assert isinstance(restored, list)

    def test_malformed_tuple_marker_missing_value(self):
        """Malformed __tuple__ marker raises SerializationError."""
        import msgpack

        codec = AutoSerializer(enable_integrity_checking=False)
        # Hand-built payload: a tuple marker that lacks its "value" entry.
        corrupt = msgpack.packb({"__tuple__": True})
        with pytest.raises(SerializationError, match="missing 'value' field"):
            codec.deserialize(corrupt)

    def test_malformed_tuple_marker_wrong_value_type(self):
        """__tuple__ marker with non-list value raises SerializationError."""
        import msgpack

        codec = AutoSerializer(enable_integrity_checking=False)
        # Hand-built payload: the marker's "value" is a string instead of a list.
        corrupt = msgpack.packb({"__tuple__": True, "value": "not a list"})
        with pytest.raises(SerializationError, match="expected list"):
            codec.deserialize(corrupt)
Comment on lines +629 to +645
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Description: Check if enable_integrity_checking=False is necessary in tuple corruption tests

# Search for all corruption tests with enable_integrity_checking
rg -n "enable_integrity_checking" tests/unit/test_auto_serializer_new_types.py -A 2 -B 2

# Search for corruption test patterns to compare tuple vs other types
rg -n "def test_.*_missing_value_field" tests/unit/test_auto_serializer_new_types.py -A 5

Repository: cachekit-io/cachekit-py

Length of output: 2364


🏁 Script executed:

# Find AutoSerializer class definition
fd AutoSerializer src/ --type f | head -20

Repository: cachekit-io/cachekit-py

Length of output: 49


🏁 Script executed:

# Search for enable_integrity_checking usage and documentation
rg -n "enable_integrity_checking" --type py | head -40

Repository: cachekit-io/cachekit-py

Length of output: 4796


🏁 Script executed:

# Search for tuple deserialization logic
rg -n "__tuple__" --type py -A 3 -B 3 | head -60

Repository: cachekit-io/cachekit-py

Length of output: 3924


🏁 Script executed:

# Check ByteStorage code and its interaction with deserialization
rg -n "ByteStorage" src/cachekit/serializers/auto_serializer.py -B 2 -A 5 | head -50

Repository: cachekit-io/cachekit-py

Length of output: 1615


🏁 Script executed:

# Check if there's any special handling of integrity checking in deserialization flow
rg -n "_auto_object_hook|__tuple__" src/cachekit/serializers/auto_serializer.py -B 3 -A 8 | head -80

Repository: cachekit-io/cachekit-py

Length of output: 2636


🏁 Script executed:

# Check if any other test file tests tuple with or without the flag
rg -n "test_.*tuple" tests/unit/ -A 2 -B 1 | grep -E "(def test|enable_integrity)" | head -20

Repository: cachekit-io/cachekit-py

Length of output: 1296


Remove unnecessary enable_integrity_checking=False flag for consistency.

Both tuple corruption tests use enable_integrity_checking=False, but this flag controls ByteStorage compression/integrity wrapper behavior—not tuple marker validation. The tuple validation (checking for missing 'value' field and correct value type) occurs in _auto_object_hook during msgpack deserialization, independent of this flag. All other corruption tests (UUID, datetime, date, time, set, frozenset) and all other tuple roundtrip tests omit this flag. Remove it from both test methods to align with the pattern used throughout the file.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@tests/unit/test_auto_serializer_new_types.py` around lines 629 - 645, Update
the two tests test_malformed_tuple_marker_missing_value and
test_malformed_tuple_marker_wrong_value_type to remove the unnecessary
enable_integrity_checking=False argument when instantiating AutoSerializer; the
tuple validation runs in _auto_object_hook during msgpack deserialization and is
independent of ByteStorage integrity/compression, so change "serializer =
AutoSerializer(enable_integrity_checking=False)" to "serializer =
AutoSerializer()" in both tests to match the pattern used by other
corruption/roundtrip tests.



class TestPythonicSerializerAlias:
    """Check that the ``"pythonic"`` serializer name resolves to AutoSerializer."""

    def test_pythonic_returns_auto_serializer(self):
        """``get_serializer("pythonic")`` yields an AutoSerializer instance."""
        from cachekit.serializers import get_serializer

        assert isinstance(get_serializer("pythonic"), AutoSerializer)
Loading