Skip to content

Commit 4e5cc8e

Browse files
paddymul and claude committed
feat: wide-column layout for summary stats parquet encoding (#646)
Replace JSON-in-Parquet summary stats with one-column-per-cell layout. Each parquet column is named col__stat (e.g. a__mean, b__histogram). Scalars go through parquet natively; only lists/dicts are JSON-encoded. JS side pivots the wide single-row back to row-based DFData that all downstream consumers (extractSDFT, extractPinnedRows, AG-Grid) expect. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c57876e commit 4e5cc8e

5 files changed

Lines changed: 329 additions & 131 deletions

File tree

buckaroo/serialization_utils.py

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -268,34 +268,58 @@ def _json_encode_cell(val):
268268
return json.dumps(_make_json_safe(val), default=str)
269269

270270

271+
def _to_python_native(val):
272+
"""Convert numpy scalars to Python builtins for pyarrow."""
273+
import numpy as np
274+
if isinstance(val, np.bool_):
275+
return bool(val)
276+
if isinstance(val, np.integer):
277+
return int(val)
278+
if isinstance(val, np.floating):
279+
if np.isnan(val):
280+
return None
281+
return float(val)
282+
if isinstance(val, float) and np.isnan(val):
283+
return None
284+
if isinstance(val, np.ndarray):
285+
return val.tolist()
286+
if isinstance(val, pd.Series):
287+
return val.to_dict()
288+
return val
289+
290+
271291
def sd_to_parquet_b64(sd: Dict[str, Any]) -> Dict[str, str]:
272292
"""Convert a summary stats dict to a tagged parquet-b64 payload.
273293
274-
Summary stats DataFrames have mixed-type columns (strings, numbers, lists)
275-
which fastparquet can't handle directly. We JSON-encode every cell value
276-
first so each column becomes a pure string column, then use pyarrow for
277-
parquet serialization. The JS side decodes parquet then JSON.parse's each cell.
294+
Uses a wide-column layout: one parquet column per (col, stat) pair.
295+
Column names are ``{short_col}__{stat_name}`` (e.g. ``a__mean``).
296+
The parquet file has a single row. Scalars (numbers, strings, bools)
297+
go through parquet natively. Lists/dicts are JSON-encoded.
278298
279299
Returns {'format': 'parquet_b64', 'data': '<base64 string>'}
280300
Falls back to JSON if parquet serialization fails.
281301
"""
282-
# JSON-encode every value so parquet sees only string columns
283-
json_sd: Dict[str, Any] = {}
284-
for col, stats in sd.items():
285-
if isinstance(stats, dict):
286-
json_sd[col] = {k: _json_encode_cell(v) for k, v in stats.items()}
287-
else:
288-
json_sd[col] = stats
289-
290-
df = pd.DataFrame(json_sd)
291-
df2 = prepare_df_for_serialization(df)
292-
# Add level_0 for backwards compatibility with JSON path (pd_to_obj adds it)
293-
if not isinstance(df.index, pd.MultiIndex):
294-
df2['level_0'] = df2['index']
302+
import pyarrow as pa
303+
304+
col_mapping = [(orig, to_chars(i)) for i, orig in enumerate(sd.keys())]
305+
wide_data: Dict[str, List] = {}
306+
307+
for orig_col, short_col in col_mapping:
308+
stats = sd[orig_col]
309+
if not isinstance(stats, dict):
310+
continue
311+
for stat_name, val in stats.items():
312+
parquet_col = f"{short_col}__{stat_name}"
313+
val = _to_python_native(val)
314+
if isinstance(val, (list, dict, tuple)):
315+
val = json.dumps(_make_json_safe(val), default=str)
316+
wide_data[parquet_col] = [val]
295317

296318
try:
319+
table = pa.table(wide_data)
297320
data = BytesIO()
298-
df2.to_parquet(data, engine='pyarrow')
321+
import pyarrow.parquet as pq
322+
pq.write_table(table, data)
299323
data.seek(0)
300324
raw_bytes = data.read()
301325
b64 = base64.b64encode(raw_bytes).decode('ascii')
Lines changed: 107 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import { parquetRead, parquetMetadata } from 'hyparquet';
2-
import { resolveDFData, resolveDFDataAsync } from './resolveDFData';
2+
import { resolveDFData, resolveDFDataAsync, pivotWideSummaryStats } from './resolveDFData';
33
import { DFData, DFDataRow, ParquetB64Payload } from './DFWhole';
44

5-
// Fixture generated by Python's sd_to_parquet_b64() with a summary stats dict
6-
// containing numeric histogram data for one column.
5+
// Fixture generated by Python's sd_to_parquet_b64() with wide-column layout.
76
// eslint-disable-next-line @typescript-eslint/no-var-requires
87
const fixture = require('./test-fixtures/summary_stats_parquet_b64.json');
98
const parquetPayload: ParquetB64Payload = fixture as ParquetB64Payload;
@@ -29,9 +28,7 @@ describe('resolveDFData', () => {
2928
expect(resolveDFData(data)).toBe(data);
3029
});
3130

32-
it('hyparquet can read the parquet_b64 fixture', async () => {
33-
// Verify the fixture is valid and hyparquet can decode it.
34-
// This is independent of resolveDFData — it tests the raw decode path.
31+
it('hyparquet can read the wide-format parquet_b64 fixture', async () => {
3532
const buf = b64ToArrayBuffer(parquetPayload.data);
3633
const metadata = parquetMetadata(buf);
3734
expect(metadata.row_groups.length).toBeGreaterThan(0);
@@ -44,60 +41,124 @@ describe('resolveDFData', () => {
4441
onComplete: (data: any[]) => { rows.push(...data); },
4542
});
4643

47-
expect(rows.length).toBeGreaterThan(0);
44+
// Wide format: single row with col__stat columns
45+
expect(rows.length).toBe(1);
46+
const keys = Object.keys(rows[0]);
47+
expect(keys.some(k => k.includes('__'))).toBe(true);
48+
expect(keys).toContain('a__mean');
49+
expect(keys).toContain('b__dtype');
50+
});
4851

49-
// Should have an 'index' column with stat names
50-
const indices = rows.map(r => r.index).filter(Boolean);
51-
expect(indices).toContain('histogram');
52-
expect(indices).toContain('dtype');
52+
it('sync resolveDFData returns [] for parquet_b64 (known async limitation)', () => {
53+
const result = resolveDFData(parquetPayload);
54+
expect(result.length).toBe(0);
5355
});
5456

55-
it('parquet_b64 histogram data round-trips with correct types', async () => {
56-
// Decode the fixture and verify histogram arrays have the right structure.
57-
const buf = b64ToArrayBuffer(parquetPayload.data);
58-
const metadata = parquetMetadata(buf);
57+
it('async resolveDFDataAsync returns pivoted DFData for wide-format parquet', async () => {
58+
const result = await resolveDFDataAsync(parquetPayload);
59+
expect(result.length).toBeGreaterThan(0);
5960

60-
const rows: DFDataRow[] = [];
61-
await parquetRead({
62-
file: buf,
63-
metadata,
64-
rowFormat: 'object',
65-
onComplete: (data: any[]) => { rows.push(...data); },
66-
});
61+
// Should have row-based format with index column
62+
const meanRow = result.find(r => r.index === 'mean');
63+
expect(meanRow).toBeDefined();
64+
expect(meanRow!.a).toBe(50.0);
65+
expect(meanRow!.b).toBe(22.0);
6766

68-
const histRow = rows.find(r => r.index === 'histogram');
69-
expect(histRow).toBeDefined();
67+
const dtypeRow = result.find(r => r.index === 'dtype');
68+
expect(dtypeRow).toBeDefined();
69+
expect(dtypeRow!.a).toBe('float64');
70+
expect(dtypeRow!.b).toBe('int64');
71+
});
7072

71-
// Column 'a' contains the JSON-encoded histogram array
72-
const rawCell = histRow!['a'];
73-
expect(typeof rawCell).toBe('string');
73+
it('async decode produces histogram arrays from JSON strings', async () => {
74+
const result = await resolveDFDataAsync(parquetPayload);
7475

75-
const parsed = JSON.parse(rawCell as string);
76-
expect(Array.isArray(parsed)).toBe(true);
77-
expect(parsed.length).toBeGreaterThan(0);
76+
const histRow = result.find(r => r.index === 'histogram');
77+
expect(histRow).toBeDefined();
78+
expect(Array.isArray(histRow!.a)).toBe(true);
79+
const hist = histRow!.a as any[];
80+
expect(hist.length).toBe(5);
81+
expect(typeof hist[0].population).toBe('number');
82+
expect(hist[0].name).toBe('0-20');
83+
});
7884

79-
// Verify types: population should be a number, not a string
80-
const popBar = parsed.find((b: any) => b.population !== undefined);
81-
expect(popBar).toBeDefined();
82-
expect(typeof popBar.population).toBe('number');
83-
expect(typeof parsed[0].name).toBe('string');
85+
it('async decode produces histogram_bins arrays', async () => {
86+
const result = await resolveDFDataAsync(parquetPayload);
87+
88+
const binsRow = result.find(r => r.index === 'histogram_bins');
89+
expect(binsRow).toBeDefined();
90+
expect(Array.isArray(binsRow!.a)).toBe(true);
91+
expect((binsRow!.a as number[]).length).toBe(6);
8492
});
93+
});
8594

86-
it('sync resolveDFData returns [] for parquet_b64 (known async limitation)', () => {
87-
// Documents #630: parquetRead is async so the sync wrapper returns [].
88-
// Widget components use useResolvedDFDataDict which falls back to async.
89-
// The static embed path uses resolveDFDataAsync which works correctly.
90-
const result = resolveDFData(parquetPayload);
91-
expect(result.length).toBe(0);
95+
describe('pivotWideSummaryStats', () => {
96+
it('pivots a wide row into row-based DFData', () => {
97+
const wideRow = {
98+
a__mean: 42.5,
99+
a__dtype: 'float64',
100+
b__mean: 10.0,
101+
b__dtype: 'int64',
102+
};
103+
const result = pivotWideSummaryStats(wideRow);
104+
105+
const meanRow = result.find(r => r.index === 'mean');
106+
expect(meanRow).toBeDefined();
107+
expect(meanRow!.a).toBe(42.5);
108+
expect(meanRow!.b).toBe(10.0);
109+
expect(meanRow!.level_0).toBe('mean');
110+
111+
const dtypeRow = result.find(r => r.index === 'dtype');
112+
expect(dtypeRow).toBeDefined();
113+
expect(dtypeRow!.a).toBe('float64');
114+
expect(dtypeRow!.b).toBe('int64');
92115
});
93116

94-
it('async resolveDFDataAsync returns non-empty result for parquet_b64', async () => {
95-
const result = await resolveDFDataAsync(parquetPayload);
96-
expect(result.length).toBeGreaterThan(0);
117+
it('JSON-parses list/object values in string cells', () => {
118+
const wideRow = {
119+
a__histogram: '[{"name": "foo", "population": 10}]',
120+
a__dtype: 'float64',
121+
};
122+
const result = pivotWideSummaryStats(wideRow);
97123

98-
// Verify the histogram row was JSON-parsed correctly
99124
const histRow = result.find(r => r.index === 'histogram');
100125
expect(histRow).toBeDefined();
101-
expect(Array.isArray(histRow!['a'])).toBe(true);
126+
expect(Array.isArray(histRow!.a)).toBe(true);
127+
expect((histRow!.a as any[])[0].population).toBe(10);
128+
});
129+
130+
it('keeps plain strings as strings (not JSON-parsed)', () => {
131+
const wideRow = {
132+
a__dtype: 'float64',
133+
};
134+
const result = pivotWideSummaryStats(wideRow);
135+
const row = result.find(r => r.index === 'dtype');
136+
expect(row!.a).toBe('float64');
137+
});
138+
139+
it('handles null values', () => {
140+
const wideRow = {
141+
a__mean: null,
142+
a__dtype: 'float64',
143+
};
144+
const result = pivotWideSummaryStats(wideRow);
145+
const meanRow = result.find(r => r.index === 'mean');
146+
expect(meanRow!.a).toBeNull();
147+
});
148+
149+
it('fills missing columns with null', () => {
150+
const wideRow = {
151+
a__mean: 42,
152+
b__dtype: 'int64',
153+
};
154+
const result = pivotWideSummaryStats(wideRow);
155+
156+
const meanRow = result.find(r => r.index === 'mean');
157+
expect(meanRow!.a).toBe(42);
158+
expect(meanRow!.b).toBeNull();
159+
160+
const dtypeRow = result.find(r => r.index === 'dtype');
161+
expect(dtypeRow!.a).toBeNull();
162+
expect(dtypeRow!.b).toBe('int64');
102163
});
103164
});

packages/buckaroo-js-core/src/components/DFViewerParts/resolveDFData.ts

Lines changed: 64 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -41,38 +41,68 @@ function b64ToArrayBuffer(b64: string): ArrayBuffer {
4141
}
4242

4343
/**
44-
* JSON-parse each cell value in a row from parquet-decoded data.
44+
* Pivot a wide single-row parquet result (col__stat columns) back to
45+
* row-based DFData that downstream consumers expect.
4546
*
46-
* The Python side JSON-encodes every cell before writing to parquet
47-
* (because summary stats have mixed types per column). We need to
48-
* JSON.parse each value back to its original type.
49-
*
50-
* The 'index' column is left as a plain string (stat name like 'mean', 'dtype').
47+
* Input: single row object like {a__mean: 42, a__dtype: "float64", b__mean: 10, ...}
48+
* Output: DFData rows like [{index: "mean", level_0: "mean", a: 42, b: 10}, ...]
5149
*/
52-
function parseParquetRow(row: Record<string, any>): DFDataRow {
53-
const parsed: DFDataRow = {};
54-
for (const [key, val] of Object.entries(row)) {
55-
if (key === 'index' || key === 'level_0') {
56-
// index/level_0 columns are stat names — keep as-is
57-
// BigInt from hyparquet INT64 columns must be converted to Number
58-
parsed[key] = typeof val === 'bigint' ? Number(val) : val;
59-
} else if (typeof val === 'string') {
50+
export function pivotWideSummaryStats(wideRow: Record<string, any>): DFData {
51+
// Group values by stat name: stat -> {col -> value}
52+
const statCols: Record<string, Record<string, any>> = {};
53+
const allCols = new Set<string>();
54+
55+
for (const [key, val] of Object.entries(wideRow)) {
56+
const sepIdx = key.indexOf('__');
57+
if (sepIdx === -1) continue;
58+
const col = key.substring(0, sepIdx);
59+
const stat = key.substring(sepIdx + 2);
60+
allCols.add(col);
61+
if (!statCols[stat]) statCols[stat] = {};
62+
63+
// JSON-parse string values that are JSON arrays/objects
64+
if (typeof val === 'string') {
6065
try {
61-
parsed[key] = JSON.parse(val);
66+
const parsed = JSON.parse(val);
67+
if (typeof parsed === 'object' && parsed !== null) {
68+
statCols[stat][col] = parsed;
69+
continue;
70+
}
6271
} catch {
63-
parsed[key] = val;
72+
// not JSON, keep as string
6473
}
65-
} else if (typeof val === 'bigint') {
66-
// hyparquet decodes INT64 as BigInt; use Number only if safe,
67-
// otherwise stringify to preserve precision (fixes #627)
74+
}
75+
// BigInt conversion (hyparquet INT64)
76+
if (typeof val === 'bigint') {
6877
const MAX_SAFE = BigInt(Number.MAX_SAFE_INTEGER);
69-
parsed[key] = val >= -MAX_SAFE && val <= MAX_SAFE
78+
statCols[stat][col] = val >= -MAX_SAFE && val <= MAX_SAFE
7079
? Number(val) : String(val);
71-
} else {
72-
parsed[key] = val;
80+
continue;
81+
}
82+
statCols[stat][col] = val;
83+
}
84+
85+
// Build DFData: one row per stat
86+
const colList = Array.from(allCols);
87+
const rows: DFData = [];
88+
for (const [stat, cols] of Object.entries(statCols)) {
89+
const row: DFDataRow = { index: stat, level_0: stat };
90+
for (let i = 0; i < colList.length; i++) {
91+
const col = colList[i];
92+
row[col] = cols[col] ?? null;
7393
}
94+
rows.push(row);
7495
}
75-
return parsed;
96+
return rows;
97+
}
98+
99+
/**
100+
* Detect wide-column format: single row where column names contain '__'.
101+
*/
102+
function isWideFormat(rows: any[]): boolean {
103+
if (rows.length !== 1) return false;
104+
const keys = Object.keys(rows[0]);
105+
return keys.some(k => k.indexOf('__') !== -1);
76106
}
77107

78108
/**
@@ -106,8 +136,11 @@ export function resolveDFData(val: DFDataOrPayload | undefined | null): DFData {
106136
metadata,
107137
rowFormat: 'object',
108138
onComplete: (data: any[]) => {
109-
// JSON-parse each cell to recover typed values
110-
result = (data as DFDataRow[]).map(parseParquetRow);
139+
if (isWideFormat(data)) {
140+
result = pivotWideSummaryStats(data[0] as Record<string, any>);
141+
} else {
142+
result = data as DFData;
143+
}
111144
cacheSet(val.data, result);
112145
},
113146
});
@@ -156,7 +189,12 @@ export async function resolveDFDataAsync(val: DFDataOrPayload | undefined | null
156189
reject(e);
157190
}
158191
});
159-
const result = (data as DFDataRow[]).map(parseParquetRow);
192+
let result: DFData;
193+
if (isWideFormat(data)) {
194+
result = pivotWideSummaryStats(data[0] as Record<string, any>);
195+
} else {
196+
result = data as DFData;
197+
}
160198
cacheSet(val.data, result);
161199
return result;
162200
} catch (e) {

0 commit comments

Comments (0)