Skip to content

Commit 4e5cc8e

Browse files
paddymul and claude committed
feat: wide-column layout for summary stats parquet encoding (#646)
Replace JSON-in-Parquet summary stats with one-column-per-cell layout. Each parquet column is named col__stat (e.g. a__mean, b__histogram). Scalars go through parquet natively; only lists/dicts are JSON-encoded. JS side pivots the wide single-row back to row-based DFData that all downstream consumers (extractSDFT, extractPinnedRows, AG-Grid) expect. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c57876e commit 4e5cc8e

5 files changed

Lines changed: 329 additions & 131 deletions

File tree

buckaroo/serialization_utils.py

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -268,34 +268,58 @@ def _json_encode_cell(val):
268268
return json.dumps(_make_json_safe(val), default=str)
269269

270270

271+
def _to_python_native(val):
272+
"""Convert numpy scalars to Python builtins for pyarrow."""
273+
import numpy as np
274+
if isinstance(val, np.bool_):
275+
return bool(val)
276+
if isinstance(val, np.integer):
277+
return int(val)
278+
if isinstance(val, np.floating):
279+
if np.isnan(val):
280+
return None
281+
return float(val)
282+
if isinstance(val, float) and np.isnan(val):
283+
return None
284+
if isinstance(val, np.ndarray):
285+
return val.tolist()
286+
if isinstance(val, pd.Series):
287+
return val.to_dict()
288+
return val
289+
290+
271291
def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
272292
"""Convert a summary stats dict to a tagged parquet-b64 payload.
273293
274-
Summary stats DataFrames have mixed-type columns (strings, numbers, lists)
275-
which fastparquet can't handle directly. We JSON-encode every cell value
276-
first so each column becomes a pure string column, then use pyarrow for
277-
parquet serialization. The JS side decodes parquet then JSON.parse's each cell.
294+
Uses a wide-column layout: one parquet column per (col, stat) pair.
295+
Column names are ``{short_col}__{stat_name}`` (e.g. ``a__mean``).
296+
The parquet file has a single row. Scalars (numbers, strings, bools)
297+
go through parquet natively. Lists/dicts are JSON-encoded.
278298
279299
Returns {'format': 'parquet_b64', 'data': '<base64 string>'}
280300
Falls back to JSON if parquet serialization fails.
281301
"""
282-
# JSON-encode every value so parquet sees only string columns
283-
json_sd: Dict[str, Any] = {}
284-
for col, stats in sd.items():
285-
if isinstance(stats, dict):
286-
json_sd[col] = {k: _json_encode_cell(v) for k, v in stats.items()}
287-
else:
288-
json_sd[col] = stats
289-
290-
df = pd.DataFrame(json_sd)
291-
df2 = prepare_df_for_serialization(df)
292-
# Add level_0 for backwards compatibility with JSON path (pd_to_obj adds it)
293-
if not isinstance(df.index, pd.MultiIndex):
294-
df2['level_0'] = df2['index']
302+
import pyarrow as pa
303+
304+
col_mapping = [(orig, to_chars(i)) for i, orig in enumerate(sd.keys())]
305+
wide_data: Dict[str, List] = {}
306+
307+
for orig_col, short_col in col_mapping:
308+
stats = sd[orig_col]
309+
if not isinstance(stats, dict):
310+
continue
311+
for stat_name, val in stats.items():
312+
parquet_col = f"{short_col}__{stat_name}"
313+
val = _to_python_native(val)
314+
if isinstance(val, (list, dict, tuple)):
315+
val = json.dumps(_make_json_safe(val), default=str)
316+
wide_data[parquet_col] = [val]
295317

296318
try:
319+
table = pa.table(wide_data)
297320
data = BytesIO()
298-
df2.to_parquet(data, engine='pyarrow')
321+
import pyarrow.parquet as pq
322+
pq.write_table(table, data)
299323
data.seek(0)
300324
raw_bytes = data.read()
301325
b64 = base64.b64encode(raw_bytes).decode('ascii')
Lines changed: 107 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import { parquetRead, parquetMetadata } from 'hyparquet';
2-
import { resolveDFData, resolveDFDataAsync } from './resolveDFData';
2+
import { resolveDFData, resolveDFDataAsync, pivotWideSummaryStats } from './resolveDFData';
33
import { DFData, DFDataRow, ParquetB64Payload } from './DFWhole';
44

5-
// Fixture generated by Python's sd_to_parquet_b64() with a summary stats dict
6-
// containing numeric histogram data for one column.
5+
// Fixture generated by Python's sd_to_parquet_b64() with wide-column layout.
76
// eslint-disable-next-line @typescript-eslint/no-var-requires
87
const fixture = require('./test-fixtures/summary_stats_parquet_b64.json');
98
const parquetPayload: ParquetB64Payload = fixture as ParquetB64Payload;
@@ -29,9 +28,7 @@ describe('resolveDFData', () => {
2928
expect(resolveDFData(data)).toBe(data);
3029
});
3130

32-
it('hyparquet can read the parquet_b64 fixture', async () => {
33-
// Verify the fixture is valid and hyparquet can decode it.
34-
// This is independent of resolveDFData — it tests the raw decode path.
31+
it('hyparquet can read the wide-format parquet_b64 fixture', async () => {
3532
const buf = b64ToArrayBuffer(parquetPayload.data);
3633
const metadata = parquetMetadata(buf);
3734
expect(metadata.row_groups.length).toBeGreaterThan(0);
@@ -44,60 +41,124 @@ describe('resolveDFData', () => {
4441
onComplete: (data: any[]) => { rows.push(...data); },
4542
});
4643

47-
expect(rows.length).toBeGreaterThan(0);
44+
// Wide format: single row with col__stat columns
45+
expect(rows.length).toBe(1);
46+
const keys = Object.keys(rows[0]);
47+
expect(keys.some(k => k.includes('__'))).toBe(true);
48+
expect(keys).toContain('a__mean');
49+
expect(keys).toContain('b__dtype');
50+
});
4851

49-
// Should have an 'index' column with stat names
50-
const indices = rows.map(r => r.index).filter(Boolean);
51-
expect(indices).toContain('histogram');
52-
expect(indices).toContain('dtype');
52+
it('sync resolveDFData returns [] for parquet_b64 (known async limitation)', () => {
53+
const result = resolveDFData(parquetPayload);
54+
expect(result.length).toBe(0);
5355
});
5456

55-
it('parquet_b64 histogram data round-trips with correct types', async () => {
56-
// Decode the fixture and verify histogram arrays have the right structure.
57-
const buf = b64ToArrayBuffer(parquetPayload.data);
58-
const metadata = parquetMetadata(buf);
57+
it('async resolveDFDataAsync returns pivoted DFData for wide-format parquet', async () => {
58+
const result = await resolveDFDataAsync(parquetPayload);
59+
expect(result.length).toBeGreaterThan(0);
5960

60-
const rows: DFDataRow[] = [];
61-
await parquetRead({
62-
file: buf,
63-
metadata,
64-
rowFormat: 'object',
65-
onComplete: (data: any[]) => { rows.push(...data); },
66-
});
61+
// Should have row-based format with index column
62+
const meanRow = result.find(r => r.index === 'mean');
63+
expect(meanRow).toBeDefined();
64+
expect(meanRow!.a).toBe(50.0);
65+
expect(meanRow!.b).toBe(22.0);
6766

68-
const histRow = rows.find(r => r.index === 'histogram');
69-
expect(histRow).toBeDefined();
67+
const dtypeRow = result.find(r => r.index === 'dtype');
68+
expect(dtypeRow).toBeDefined();
69+
expect(dtypeRow!.a).toBe('float64');
70+
expect(dtypeRow!.b).toBe('int64');
71+
});
7072

71-
// Column 'a' contains the JSON-encoded histogram array
72-
const rawCell = histRow!['a'];
73-
expect(typeof rawCell).toBe('string');
73+
it('async decode produces histogram arrays from JSON strings', async () => {
74+
const result = await resolveDFDataAsync(parquetPayload);
7475

75-
const parsed = JSON.parse(rawCell as string);
76-
expect(Array.isArray(parsed)).toBe(true);
77-
expect(parsed.length).toBeGreaterThan(0);
76+
const histRow = result.find(r => r.index === 'histogram');
77+
expect(histRow).toBeDefined();
78+
expect(Array.isArray(histRow!.a)).toBe(true);
79+
const hist = histRow!.a as any[];
80+
expect(hist.length).toBe(5);
81+
expect(typeof hist[0].population).toBe('number');
82+
expect(hist[0].name).toBe('0-20');
83+
});
7884

79-
// Verify types: population should be a number, not a string
80-
const popBar = parsed.find((b: any) => b.population !== undefined);
81-
expect(popBar).toBeDefined();
82-
expect(typeof popBar.population).toBe('number');
83-
expect(typeof parsed[0].name).toBe('string');
85+
it('async decode produces histogram_bins arrays', async () => {
86+
const result = await resolveDFDataAsync(parquetPayload);
87+
88+
const binsRow = result.find(r => r.index === 'histogram_bins');
89+
expect(binsRow).toBeDefined();
90+
expect(Array.isArray(binsRow!.a)).toBe(true);
91+
expect((binsRow!.a as number[]).length).toBe(6);
8492
});
93+
});
8594

86-
it('sync resolveDFData returns [] for parquet_b64 (known async limitation)', () => {
87-
// Documents #630: parquetRead is async so the sync wrapper returns [].
88-
// Widget components use useResolvedDFDataDict which falls back to async.
89-
// The static embed path uses resolveDFDataAsync which works correctly.
90-
const result = resolveDFData(parquetPayload);
91-
expect(result.length).toBe(0);
95+
describe('pivotWideSummaryStats', () => {
96+
it('pivots a wide row into row-based DFData', () => {
97+
const wideRow = {
98+
a__mean: 42.5,
99+
a__dtype: 'float64',
100+
b__mean: 10.0,
101+
b__dtype: 'int64',
102+
};
103+
const result = pivotWideSummaryStats(wideRow);
104+
105+
const meanRow = result.find(r => r.index === 'mean');
106+
expect(meanRow).toBeDefined();
107+
expect(meanRow!.a).toBe(42.5);
108+
expect(meanRow!.b).toBe(10.0);
109+
expect(meanRow!.level_0).toBe('mean');
110+
111+
const dtypeRow = result.find(r => r.index === 'dtype');
112+
expect(dtypeRow).toBeDefined();
113+
expect(dtypeRow!.a).toBe('float64');
114+
expect(dtypeRow!.b).toBe('int64');
92115
});
93116

94-
it('async resolveDFDataAsync returns non-empty result for parquet_b64', async () => {
95-
const result = await resolveDFDataAsync(parquetPayload);
96-
expect(result.length).toBeGreaterThan(0);
117+
it('JSON-parses list/object values in string cells', () => {
118+
const wideRow = {
119+
a__histogram: '[{"name": "foo", "population": 10}]',
120+
a__dtype: 'float64',
121+
};
122+
const result = pivotWideSummaryStats(wideRow);
97123

98-
// Verify the histogram row was JSON-parsed correctly
99124
const histRow = result.find(r => r.index === 'histogram');
100125
expect(histRow).toBeDefined();
101-
expect(Array.isArray(histRow!['a'])).toBe(true);
126+
expect(Array.isArray(histRow!.a)).toBe(true);
127+
expect((histRow!.a as any[])[0].population).toBe(10);
128+
});
129+
130+
it('keeps plain strings as strings (not JSON-parsed)', () => {
131+
const wideRow = {
132+
a__dtype: 'float64',
133+
};
134+
const result = pivotWideSummaryStats(wideRow);
135+
const row = result.find(r => r.index === 'dtype');
136+
expect(row!.a).toBe('float64');
137+
});
138+
139+
it('handles null values', () => {
140+
const wideRow = {
141+
a__mean: null,
142+
a__dtype: 'float64',
143+
};
144+
const result = pivotWideSummaryStats(wideRow);
145+
const meanRow = result.find(r => r.index === 'mean');
146+
expect(meanRow!.a).toBeNull();
147+
});
148+
149+
it('fills missing columns with null', () => {
150+
const wideRow = {
151+
a__mean: 42,
152+
b__dtype: 'int64',
153+
};
154+
const result = pivotWideSummaryStats(wideRow);
155+
156+
const meanRow = result.find(r => r.index === 'mean');
157+
expect(meanRow!.a).toBe(42);
158+
expect(meanRow!.b).toBeNull();
159+
160+
const dtypeRow = result.find(r => r.index === 'dtype');
161+
expect(dtypeRow!.a).toBeNull();
162+
expect(dtypeRow!.b).toBe('int64');
102163
});
103164
});

packages/buckaroo-js-core/src/components/DFViewerParts/resolveDFData.ts

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -41,38 +41,68 @@ function b64ToArrayBuffer(b64: string): ArrayBuffer {
4141
}
4242

4343
/**
44-
* JSON-parse each cell value in a row from parquet-decoded data.
44+
* Pivot a wide single-row parquet result (col__stat columns) back to
45+
* row-based DFData that downstream consumers expect.
4546
*
46-
* The Python side JSON-encodes every cell before writing to parquet
47-
* (because summary stats have mixed types per column). We need to
48-
* JSON.parse each value back to its original type.
49-
*
50-
* The 'index' column is left as a plain string (stat name like 'mean', 'dtype').
47+
* Input: single row object like {a__mean: 42, a__dtype: "float64", b__mean: 10, ...}
48+
* Output: DFData rows like [{index: "mean", level_0: "mean", a: 42, b: 10}, ...]
5149
*/
52-
function parseParquetRow(row: Record<string, any>): DFDataRow {
53-
const parsed: DFDataRow = {};
54-
for (const [key, val] of Object.entries(row)) {
55-
if (key === 'index' || key === 'level_0') {
56-
// index/level_0 columns are stat names — keep as-is
57-
// BigInt from hyparquet INT64 columns must be converted to Number
58-
parsed[key] = typeof val === 'bigint' ? Number(val) : val;
59-
} else if (typeof val === 'string') {
50+
export function pivotWideSummaryStats(wideRow: Record<string, any>): DFData {
51+
// Group values by stat name: stat -> {col -> value}
52+
const statCols: Record<string, Record<string, any>> = {};
53+
const allCols = new Set<string>();
54+
55+
for (const [key, val] of Object.entries(wideRow)) {
56+
const sepIdx = key.indexOf('__');
57+
if (sepIdx === -1) continue;
58+
const col = key.substring(0, sepIdx);
59+
const stat = key.substring(sepIdx + 2);
60+
allCols.add(col);
61+
if (!statCols[stat]) statCols[stat] = {};
62+
63+
// JSON-parse string values that are JSON arrays/objects
64+
if (typeof val === 'string') {
6065
try {
61-
parsed[key] = JSON.parse(val);
66+
const parsed = JSON.parse(val);
67+
if (typeof parsed === 'object' && parsed !== null) {
68+
statCols[stat][col] = parsed;
69+
continue;
70+
}
6271
} catch {
63-
parsed[key] = val;
72+
// not JSON, keep as string
6473
}
65-
} else if (typeof val === 'bigint') {
66-
// hyparquet decodes INT64 as BigInt; use Number only if safe,
67-
// otherwise stringify to preserve precision (fixes #627)
74+
}
75+
// BigInt conversion (hyparquet INT64)
76+
if (typeof val === 'bigint') {
6877
const MAX_SAFE = BigInt(Number.MAX_SAFE_INTEGER);
69-
parsed[key] = val >= -MAX_SAFE && val <= MAX_SAFE
78+
statCols[stat][col] = val >= -MAX_SAFE && val <= MAX_SAFE
7079
? Number(val) : String(val);
71-
} else {
72-
parsed[key] = val;
80+
continue;
81+
}
82+
statCols[stat][col] = val;
83+
}
84+
85+
// Build DFData: one row per stat
86+
const colList = Array.from(allCols);
87+
const rows: DFData = [];
88+
for (const [stat, cols] of Object.entries(statCols)) {
89+
const row: DFDataRow = { index: stat, level_0: stat };
90+
for (let i = 0; i < colList.length; i++) {
91+
const col = colList[i];
92+
row[col] = cols[col] ?? null;
7393
}
94+
rows.push(row);
7495
}
75-
return parsed;
96+
return rows;
97+
}
98+
99+
/**
100+
* Detect wide-column format: single row where column names contain '__'.
101+
*/
102+
function isWideFormat(rows: any[]): boolean {
103+
if (rows.length !== 1) return false;
104+
const keys = Object.keys(rows[0]);
105+
return keys.some(k => k.indexOf('__') !== -1);
76106
}
77107

78108
/**
@@ -106,8 +136,11 @@ export function resolveDFData(val: DFDataOrPayload | undefined | null): DFData {
106136
metadata,
107137
rowFormat: 'object',
108138
onComplete: (data: any[]) => {
109-
// JSON-parse each cell to recover typed values
110-
result = (data as DFDataRow[]).map(parseParquetRow);
139+
if (isWideFormat(data)) {
140+
result = pivotWideSummaryStats(data[0] as Record<string, any>);
141+
} else {
142+
result = data as DFData;
143+
}
111144
cacheSet(val.data, result);
112145
},
113146
});
@@ -156,7 +189,12 @@ export async function resolveDFDataAsync(val: DFDataOrPayload | undefined | null
156189
reject(e);
157190
}
158191
});
159-
const result = (data as DFDataRow[]).map(parseParquetRow);
192+
let result: DFData;
193+
if (isWideFormat(data)) {
194+
result = pivotWideSummaryStats(data[0] as Record<string, any>);
195+
} else {
196+
result = data as DFData;
197+
}
160198
cacheSet(val.data, result);
161199
return result;
162200
} catch (e) {

0 commit comments

Comments (0)