Skip to content

Commit f6ba18c

Browse files
paddymul and claude
committed
fix: drop _to_python_native, let pyarrow handle numpy scalars natively
- pyarrow already handles numpy scalars (float64, int64, bool_, nan)
- Replace _to_python_native with _is_complex_for_parquet check
- Fix pd.Series.to_dict() crash on unhashable values (fall back to to_list)
- Update _resolve_all_stats test helpers to handle wide-column format

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4e5cc8e commit f6ba18c

4 files changed

Lines changed: 111 additions & 45 deletions

File tree

buckaroo/serialization_utils.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -268,24 +268,16 @@ def _json_encode_cell(val):
268268
return json.dumps(_make_json_safe(val), default=str)
269269

270270

271-
def _to_python_native(val):
272-
"""Convert numpy scalars to Python builtins for pyarrow."""
271+
def _is_complex_for_parquet(val):
272+
"""Return True if val needs JSON encoding for parquet (not a scalar)."""
273273
import numpy as np
274-
if isinstance(val, np.bool_):
275-
return bool(val)
276-
if isinstance(val, np.integer):
277-
return int(val)
278-
if isinstance(val, np.floating):
279-
if np.isnan(val):
280-
return None
281-
return float(val)
282-
if isinstance(val, float) and np.isnan(val):
283-
return None
284-
if isinstance(val, np.ndarray):
285-
return val.tolist()
286274
if isinstance(val, pd.Series):
287-
return val.to_dict()
288-
return val
275+
return True
276+
if isinstance(val, np.ndarray):
277+
return True
278+
if isinstance(val, (list, dict, tuple)):
279+
return True
280+
return False
289281

290282

291283
def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
@@ -310,8 +302,12 @@ def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
310302
continue
311303
for stat_name, val in stats.items():
312304
parquet_col = f"{short_col}__{stat_name}"
313-
val = _to_python_native(val)
314-
if isinstance(val, (list, dict, tuple)):
305+
if isinstance(val, pd.Series):
306+
try:
307+
val = val.to_dict()
308+
except TypeError:
309+
val = val.to_list()
310+
if _is_complex_for_parquet(val):
315311
val = json.dumps(_make_json_safe(val), default=str)
316312
wide_data[parquet_col] = [val]
317313

tests/unit/lazy_infinite_polars_widget_test.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import polars as pl
2-
import pandas as pd
32
import base64
43
from io import BytesIO
54
import json
@@ -13,14 +12,49 @@
1312

1413

1514
def _resolve_all_stats(all_stats):
16-
"""Resolve all_stats to a list of row dicts, whether it's JSON or parquet_b64."""
15+
"""Resolve all_stats to a list of row dicts, whether it's JSON or parquet_b64.
16+
17+
Handles both old row-based and new wide-column (col__stat) formats.
18+
"""
1719
if isinstance(all_stats, list):
1820
return all_stats
1921
if isinstance(all_stats, dict) and all_stats.get('format') == 'parquet_b64':
22+
import pyarrow.parquet as pq
2023
raw = base64.b64decode(all_stats['data'])
21-
df = pd.read_parquet(BytesIO(raw), engine='pyarrow')
24+
table = pq.read_table(BytesIO(raw))
25+
col_names = table.column_names
26+
27+
# Detect wide format: column names contain '__'
28+
if any('__' in c for c in col_names):
29+
row_dict = table.to_pydict()
30+
stat_cols = {}
31+
all_cols = set()
32+
for key in col_names:
33+
sep = key.index('__')
34+
col, stat = key[:sep], key[sep+2:]
35+
all_cols.add(col)
36+
if stat not in stat_cols:
37+
stat_cols[stat] = {}
38+
val = row_dict[key][0]
39+
if isinstance(val, str):
40+
try:
41+
parsed = json.loads(val)
42+
if isinstance(parsed, (list, dict)):
43+
val = parsed
44+
except (json.JSONDecodeError, ValueError):
45+
pass
46+
stat_cols[stat][col] = val
47+
rows = []
48+
for stat, cols in stat_cols.items():
49+
row = {'index': stat, 'level_0': stat}
50+
for c in sorted(all_cols):
51+
row[c] = cols.get(c)
52+
rows.append(row)
53+
return rows
54+
55+
# Old row-based format fallback
56+
df = table.to_pandas()
2257
rows = json.loads(df.to_json(orient='records'))
23-
# JSON-parse each cell (they were JSON-encoded on the Python side)
2458
parsed_rows = []
2559
for row in rows:
2660
parsed = {}

tests/unit/polars_basic_widget_test.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import polars as pl
66
from polars import functions as F
77
import numpy as np
8-
import pandas as pd
98
from buckaroo.pluggable_analysis_framework.polars_analysis_management import (
109
PolarsAnalysis, polars_produce_series_df)
1110
from buckaroo.pluggable_analysis_framework.col_analysis import (
@@ -19,12 +18,48 @@
1918

2019

2120
def _resolve_all_stats(all_stats):
22-
"""Resolve all_stats to a list of row dicts, whether it's JSON or parquet_b64."""
21+
"""Resolve all_stats to a list of row dicts, whether it's JSON or parquet_b64.
22+
23+
Handles both old row-based and new wide-column (col__stat) formats.
24+
"""
2325
if isinstance(all_stats, list):
2426
return all_stats
2527
if isinstance(all_stats, dict) and all_stats.get('format') == 'parquet_b64':
28+
import pyarrow.parquet as pq
2629
raw = base64.b64decode(all_stats['data'])
27-
df = pd.read_parquet(BytesIO(raw), engine='pyarrow')
30+
table = pq.read_table(BytesIO(raw))
31+
col_names = table.column_names
32+
33+
# Detect wide format: column names contain '__'
34+
if any('__' in c for c in col_names):
35+
row_dict = table.to_pydict()
36+
stat_cols = {} # stat -> {col -> value}
37+
all_cols = set()
38+
for key in col_names:
39+
sep = key.index('__')
40+
col, stat = key[:sep], key[sep+2:]
41+
all_cols.add(col)
42+
if stat not in stat_cols:
43+
stat_cols[stat] = {}
44+
val = row_dict[key][0]
45+
if isinstance(val, str):
46+
try:
47+
parsed = json.loads(val)
48+
if isinstance(parsed, (list, dict)):
49+
val = parsed
50+
except (json.JSONDecodeError, ValueError):
51+
pass
52+
stat_cols[stat][col] = val
53+
rows = []
54+
for stat, cols in stat_cols.items():
55+
row = {'index': stat, 'level_0': stat}
56+
for c in sorted(all_cols):
57+
row[c] = cols.get(c)
58+
rows.append(row)
59+
return rows
60+
61+
# Old row-based format fallback
62+
df = table.to_pandas()
2863
rows = json.loads(df.to_json(orient='records'))
2964
parsed_rows = []
3065
for row in rows:

tests/unit/test_sd_to_parquet_b64.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pandas as pd
1212
import pyarrow.parquet as pq
1313

14-
from buckaroo.serialization_utils import sd_to_parquet_b64, _to_python_native
14+
from buckaroo.serialization_utils import sd_to_parquet_b64
1515

1616

1717
def _decode_parquet_b64(result):
@@ -144,14 +144,14 @@ def test_sd_to_parquet_b64_multiple_columns():
144144
assert row['b__dtype'] == ['int64']
145145

146146

147-
def test_sd_to_parquet_b64_nan_becomes_null():
148-
"""NaN values should become parquet nulls."""
147+
def test_sd_to_parquet_b64_nan_preserved():
148+
"""NaN values should survive the parquet round-trip."""
149149
sd = {'col': {'mean': np.nan, 'dtype': 'float64'}}
150150
result = sd_to_parquet_b64(sd)
151151
table = _decode_parquet_b64(result)
152152
row = table.to_pydict()
153153

154-
assert row['a__mean'] == [None]
154+
assert np.isnan(row['a__mean'][0])
155155
assert row['a__dtype'] == ['float64']
156156

157157

@@ -173,22 +173,23 @@ def test_sd_to_parquet_b64_value_counts_series():
173173
assert parsed == {'foo': 10, 'bar': 5}
174174

175175

176-
def test_to_python_native_conversions():
177-
assert _to_python_native(np.float64(3.14)) == 3.14
178-
assert isinstance(_to_python_native(np.float64(3.14)), float)
179-
180-
assert _to_python_native(np.int64(42)) == 42
181-
assert isinstance(_to_python_native(np.int64(42)), int)
182-
183-
assert _to_python_native(np.bool_(True)) is True
184-
assert isinstance(_to_python_native(np.bool_(True)), bool)
185-
186-
assert _to_python_native(np.nan) is None
187-
188-
arr = np.array([1, 2, 3])
189-
assert _to_python_native(arr) == [1, 2, 3]
176+
def test_numpy_scalars_handled_natively_by_pyarrow():
177+
"""pyarrow handles numpy scalars without manual conversion."""
178+
sd = {
179+
'col': {
180+
'mean': np.float64(3.14),
181+
'count': np.int64(42),
182+
'is_numeric': np.bool_(True),
183+
'nan_val': np.nan,
184+
},
185+
}
186+
result = sd_to_parquet_b64(sd)
187+
table = _decode_parquet_b64(result)
188+
row = table.to_pydict()
190189

191-
assert _to_python_native("hello") == "hello"
192-
assert _to_python_native(None) is None
190+
assert row['a__mean'] == [3.14]
191+
assert row['a__count'] == [42]
192+
assert row['a__is_numeric'] == [True]
193+
assert np.isnan(row['a__nan_val'][0])
193194

194195

0 commit comments

Comments (0)