Skip to content

Commit 512928e

Browse files
paddymul and claude authored
fix: display Duration and weird-type columns (#622) (#635)
* fix: display Duration and weird-type columns instead of blank cells (#622) Add type detection (duration, categorical, period, interval, binary) to v1 TypingStats, JS DurationDisplayer for human-readable formatting, serialization fixes for period/interval/timedelta/bytes columns, and partial-stats error recovery so non-critical failures don't blank the entire stats row. Pandas end-to-end support is complete; Polars type detection deferred to the v2 stats pipeline PR. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: skip histogram assertion for DFViewer integration tests DFViewer/PolarsDFViewer don't run the analysis pipeline, so they never produce .histogram-component elements. The unconditional assertion added in f2147ce causes these notebooks to fail. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: add type detection for weird types in both v1 and v2 stats pipelines Add is_timedelta, is_categorical, is_period, is_interval, is_time, is_decimal, is_binary flags to pd_stats_v2 and pl_stats_v2 typing_stats functions so Duration, Time, Categorical, Decimal, and Binary columns get the correct _type and displayer mapping. Without this, the Polars pipeline misclassifies Duration as "datetime" and renders blank cells. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: parse pandas timedelta strings in duration formatter formatDuration now handles both ISO 8601 ("P1DT2H3M4.5S") and pandas timedelta repr ("1 days 02:03:04") formats. The serializer produces pandas-style strings via astype(str), so without this the formatter was falling back to the raw string for every real duration value. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ac93de6 commit 512928e

19 files changed

Lines changed: 1277 additions & 23 deletions

File tree

buckaroo/customizations/analysis.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,10 @@ class TypingStats(ColAnalysis):
7676

7777
provides_defaults = {
7878
'dtype':'asdf', 'is_numeric':False, 'is_integer':False,
79-
'is_datetime':False, 'is_bool':False, 'is_float':False, '_type':'asdf'}
79+
'is_datetime':False, 'is_timedelta':False, 'is_bool':False,
80+
'is_float':False, 'is_categorical':False, 'is_period':False,
81+
'is_interval':False, 'is_time':False, 'is_decimal':False,
82+
'is_binary':False, '_type':'asdf'}
8083

8184
@staticmethod
8285
def series_summary(sampled_ser, ser):
@@ -85,9 +88,13 @@ def series_summary(sampled_ser, ser):
8588
is_numeric=pd.api.types.is_numeric_dtype(ser),
8689
is_integer=pd.api.types.is_integer_dtype(ser),
8790
is_datetime=pd.api.types.is_datetime64_any_dtype(ser),
91+
is_timedelta=pd.api.types.is_timedelta64_dtype(ser),
8892
is_bool=pd.api.types.is_bool_dtype(ser),
8993
is_float=pd.api.types.is_float_dtype(ser),
9094
is_string=pd.api.types.is_string_dtype(ser),
95+
is_categorical=isinstance(ser.dtype, pd.CategoricalDtype),
96+
is_period=isinstance(ser.dtype, pd.PeriodDtype),
97+
is_interval=isinstance(ser.dtype, pd.IntervalDtype),
9198
memory_usage=ser.memory_usage())
9299

93100
@staticmethod
@@ -100,9 +107,16 @@ def computed_summary(summary_dict):
100107
_type = "float"
101108
else:
102109
_type = "integer"
103-
#elif pd.api.types.is_datetime64_any_dtype(ser):
110+
elif summary_dict.get('is_timedelta'):
111+
_type = 'duration'
104112
elif summary_dict['is_datetime']:
105113
_type = 'datetime'
114+
elif summary_dict.get('is_categorical'):
115+
_type = 'categorical'
116+
elif summary_dict.get('is_period'):
117+
_type = 'period'
118+
elif summary_dict.get('is_interval'):
119+
_type = 'interval'
106120
elif summary_dict['is_string']:
107121
_type = "string"
108122
return dict(_type=_type)

buckaroo/customizations/pd_stats_v2.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,16 @@ def orig_col_name(ser: RawSeries) -> Any:
6262
'is_numeric': bool,
6363
'is_integer': bool,
6464
'is_datetime': bool,
65+
'is_timedelta': bool,
6566
'is_bool': bool,
6667
'is_float': bool,
6768
'is_string': bool,
69+
'is_categorical': bool,
70+
'is_period': bool,
71+
'is_interval': bool,
72+
'is_time': bool,
73+
'is_decimal': bool,
74+
'is_binary': bool,
6875
'memory_usage': int,
6976
})
7077

@@ -77,25 +84,48 @@ def typing_stats(ser: RawSeries) -> TypingResult:
7784
'is_numeric': pd.api.types.is_numeric_dtype(ser),
7885
'is_integer': pd.api.types.is_integer_dtype(ser),
7986
'is_datetime': pd.api.types.is_datetime64_any_dtype(ser),
87+
'is_timedelta': pd.api.types.is_timedelta64_dtype(ser),
8088
'is_bool': pd.api.types.is_bool_dtype(ser),
8189
'is_float': pd.api.types.is_float_dtype(ser),
8290
'is_string': pd.api.types.is_string_dtype(ser),
91+
'is_categorical': isinstance(ser.dtype, pd.CategoricalDtype),
92+
'is_period': isinstance(ser.dtype, pd.PeriodDtype),
93+
'is_interval': isinstance(ser.dtype, pd.IntervalDtype),
94+
'is_time': False,
95+
'is_decimal': False,
96+
'is_binary': False,
8397
'memory_usage': ser.memory_usage(),
8498
}
8599

86100

87101
@stat()
def _type(is_bool: bool, is_numeric: bool, is_float: bool,
          is_datetime: bool, is_timedelta: bool, is_string: bool,
          is_categorical: bool, is_period: bool, is_interval: bool,
          is_time: bool, is_decimal: bool, is_binary: bool) -> str:
    """Map the per-column typing flags to a single type label.

    The flags are checked in a fixed priority order and the first truthy
    one wins: boolean, decimal, numeric (float or integer), duration,
    time, datetime, categorical, period, interval, binary, string.
    When no flag is set the label is ``"obj"``.
    """
    if is_bool:
        return "boolean"
    if is_decimal:
        return "decimal"
    if is_numeric:
        return "float" if is_float else "integer"

    # Remaining flags, listed in priority order.
    ordered_flags = (
        (is_timedelta, "duration"),
        (is_time, "time"),
        (is_datetime, "datetime"),
        (is_categorical, "categorical"),
        (is_period, "period"),
        (is_interval, "interval"),
        (is_binary, "binary"),
        (is_string, "string"),
    )
    for flag, label in ordered_flags:
        if flag:
            return label
    return "obj"

buckaroo/customizations/pl_stats_v2.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,14 @@ def pl_orig_col_name(ser: RawSeries) -> Any:
5252
'is_float': bool,
5353
'is_bool': bool,
5454
'is_datetime': bool,
55+
'is_timedelta': bool,
5556
'is_string': bool,
57+
'is_categorical': bool,
58+
'is_period': bool,
59+
'is_interval': bool,
60+
'is_time': bool,
61+
'is_decimal': bool,
62+
'is_binary': bool,
5663
'memory_usage': int,
5764
})
5865

@@ -63,12 +70,19 @@ def pl_typing_stats(ser: RawSeries) -> PlTypingResult:
6370
dt = ser.dtype
6471
return {
6572
'dtype': str(dt),
66-
'is_numeric': dt.is_numeric(),
73+
'is_numeric': dt.is_numeric() and dt.base_type() is not pl.Decimal,
6774
'is_integer': dt.is_integer(),
6875
'is_float': dt.is_float(),
6976
'is_bool': dt == pl.Boolean,
70-
'is_datetime': dt.is_temporal(),
77+
'is_datetime': dt.is_temporal() and dt not in (pl.Duration, pl.Time),
78+
'is_timedelta': dt == pl.Duration,
7179
'is_string': dt in (pl.Utf8, pl.String),
80+
'is_categorical': dt == pl.Categorical or isinstance(dt, pl.Enum),
81+
'is_period': False,
82+
'is_interval': False,
83+
'is_time': dt == pl.Time,
84+
'is_decimal': dt.base_type() is pl.Decimal,
85+
'is_binary': dt == pl.Binary,
7286
'memory_usage': ser.estimated_size(),
7387
}
7488

buckaroo/customizations/styling.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ def _formatted_char_count(displayer_args, column_metadata):
4141
if d in ('datetimeLocaleString', 'datetimeDefault'):
4242
return 18 # "12/31/2024, 11:59 PM"
4343

44+
if d == 'duration':
45+
return 14 # e.g. "365d 23h 59m 59s"
46+
4447
return 8 # obj / fallback
4548

4649

@@ -79,6 +82,16 @@ def style_column(kls, col:str, column_metadata: Any) -> Any:
7982
disp = {'displayer': 'float', 'min_fraction_digits':digits, 'max_fraction_digits':digits}
8083
elif t == 'datetime':
8184
disp = {'displayer': 'datetimeLocaleString','locale': 'en-US', 'args': {}}
85+
elif t == 'decimal':
86+
disp = {'displayer': 'float', 'min_fraction_digits':digits, 'max_fraction_digits':digits}
87+
elif t == 'duration':
88+
disp = {'displayer': 'duration'}
89+
elif t in ('time', 'categorical', 'period', 'interval'):
90+
disp = {'displayer': 'string', 'max_length': 35}
91+
base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': str(col)}
92+
elif t == 'binary':
93+
disp = {'displayer': 'obj'}
94+
base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': str(col)}
8295
elif t == 'string':
8396
disp = {'displayer': 'string', 'max_length': 35}
8497
base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': str(col)}

buckaroo/dataflow/dataflow.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,9 @@ def _get_summary_sd(self, processed_df:pd.DataFrame) -> Tuple[SDType, TDict[str,
408408
if self.debug:
409409
raise Exception("Error executing analysis")
410410
else:
411-
return {}, stats.errs
411+
# Return partial results — non-critical errors (e.g. histogram
412+
# failure for Decimal) shouldn't discard all stats
413+
return sdf, stats.errs
412414
else:
413415
return sdf, {}
414416

buckaroo/ddd_library.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,48 @@ def get_df_with_named_index() -> pd.DataFrame:
154154
index=pd.Index([10,20,30,40,50], name='foo'))
155155

156156

157+
def df_with_weird_types() -> pd.DataFrame:
    """Build a five-row pandas frame whose columns use unusual dtypes.

    These dtypes historically broke rendering.  The frame holds a
    Categorical column, a timedelta column, a monthly Period column, an
    Interval column, and a plain integer column.
    """
    colours = pd.Categorical(['red', 'green', 'blue', 'red', 'green'])
    durations = pd.to_timedelta([
        '1 days 02:03:04',
        '0 days 00:00:01',
        '365 days',
        '0 days 00:00:00.001',
        '0 days 00:00:00.000100',
    ])
    months = pd.Series(pd.period_range('2021-01', periods=5, freq='M'))
    spans = pd.Series(pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3, 4, 5]))

    columns = {
        'categorical': colours,
        'timedelta': durations,
        'period': months,
        'interval': spans,
        'int_col': [10, 20, 30, 40, 50],
    }
    return pd.DataFrame(columns)
171+
172+
173+
def pl_df_with_weird_types():
    """Build a five-row Polars frame whose columns use unusual dtypes.

    These dtypes historically broke rendering.  Covers Duration
    (issue #622), Time, Categorical, Decimal and Binary, plus a plain
    integer column.  Display it with PolarsBuckarooWidget rather than the
    default pandas widget.
    """
    import datetime as dt
    import polars as pl

    durations = pl.Series(
        [100_000, 3_723_000_000, 86_400_000_000, 500, 60_000_000],
        dtype=pl.Duration('us'),
    )
    times = [
        dt.time(14, 30),
        dt.time(9, 15, 30),
        dt.time(0, 0, 1),
        dt.time(23, 59, 59),
        dt.time(12, 0),
    ]
    colours = pl.Series(['red', 'green', 'blue', 'red', 'green']).cast(pl.Categorical)
    # Decimals are written as strings and then cast to Decimal(10, 2).
    amounts = pl.Series(
        ['100.50', '200.75', '0.01', '99999.99', '3.14'],
    ).cast(pl.Decimal(10, 2))
    blobs = [b'hello', b'world', b'\x00\x01\x02', b'test', b'\xff\xfe']

    return pl.DataFrame({
        'duration': durations,
        'time': times,
        'categorical': colours,
        'decimal': amounts,
        'binary': blobs,
        'int_col': [10, 20, 30, 40, 50],
    })
192+
193+
194+
def pl_df_with_weird_types_as_pandas():
    """Return the Polars weird-types frame converted via ``to_pandas()``.

    Intended for use with pandas-based widgets.
    """
    polars_frame = pl_df_with_weird_types()
    return polars_frame.to_pandas()
197+
198+
157199
"""
158200
Make a duplicate column dataframe
159201

buckaroo/pluggable_analysis_framework/safe_summary_df.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ def output_full_reproduce(errs, summary_df, df_name):
9696
try:
9797
for ser_name, err_kls in errs.items():
9898
err, kls = err_kls
99+
if kls is None:
100+
# v2 stat functions don't have a v1 ColAnalysis class
101+
col, stat = ser_name if isinstance(ser_name, tuple) else (ser_name, '?')
102+
print(f"# {col}:{stat}{err}")
103+
continue
99104
reproduce_summary(ser_name, kls, summary_df, err, df_name)
100105
except Exception:
101106
#this is tricky stuff that shouldn't error, I want these stack traces to escape being caught

buckaroo/serialization_utils.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,25 @@ def force_to_pandas(df_pd_or_pl) -> pd.DataFrame:
118118

119119

120120

121+
def _bytes_to_hex(value):
    """Return ``value.hex()`` for raw ``bytes``; pass anything else through."""
    return value.hex() if isinstance(value, bytes) else value


def _coerce_for_json(df: pd.DataFrame) -> pd.DataFrame:
    """Convert columns whose types ``to_json`` can't handle.

    * Period, Interval and timedelta columns are cast with ``astype(str)``.
    * Object columns that contain ``bytes`` anywhere have each bytes value
      replaced by its hex string; other values in the column are kept.

    The input frame is never modified.  If no column needs converting,
    ``df`` itself is returned; otherwise a copy with the converted columns
    is returned.

    Args:
        df: frame about to be serialized.

    Returns:
        ``df`` unchanged, or a converted copy.
    """
    out = df
    for col in df.columns:
        ser = df[col]
        dtype = ser.dtype
        if (isinstance(dtype, (pd.PeriodDtype, pd.IntervalDtype))
                or pd.api.types.is_timedelta64_dtype(dtype)):
            converted = ser.astype(str)
        elif dtype == object:  # noqa: E721 — np.dtype('O') != builtin object via `is`
            # Scan the whole column (short-circuiting on the first hit)
            # instead of sampling only the first non-null value, so mixed
            # columns such as ['text', b'raw'] are still converted.
            if not any(isinstance(v, bytes) for v in ser):
                continue
            converted = ser.apply(_bytes_to_hex)
        else:
            continue
        if out is df:
            # Copy lazily, only once a conversion is actually required,
            # so the caller's frame is left untouched.
            out = df.copy()
        out[col] = converted
    return out
135+
136+
121137
def pd_to_obj(df:pd.DataFrame) -> Dict[str, Any]:
122138
df2 = prepare_df_for_serialization(df)
139+
df2 = _coerce_for_json(df2)
123140
# Add level_0 for JSON serialization to maintain backwards compatibility
124141
# This is only needed for JSON, not for Parquet serialization
125142
if not isinstance(df.index, pd.MultiIndex):
@@ -194,6 +211,18 @@ def to_parquet(df):
194211
if pd.api.types.is_string_dtype(df2[col].dtype) and not pd.api.types.is_object_dtype(df2[col].dtype):
195212
df2[col] = df2[col].astype('object')
196213

214+
# Convert dtypes that fastparquet can't handle to string/object
215+
for col in df2.columns:
216+
dtype = df2[col].dtype
217+
if isinstance(dtype, (pd.PeriodDtype, pd.IntervalDtype)):
218+
df2[col] = df2[col].astype(str)
219+
elif pd.api.types.is_timedelta64_dtype(dtype):
220+
df2[col] = df2[col].astype(str)
221+
elif dtype == object: # noqa: E721 — np.dtype('O') != builtin object via `is`
222+
sample = df2[col].dropna().head(1)
223+
if len(sample) > 0 and isinstance(sample.iloc[0], bytes):
224+
df2[col] = df2[col].apply(lambda x: x.hex() if isinstance(x, bytes) else x)
225+
197226
obj_columns = df2.select_dtypes([pd.CategoricalDtype(), 'object']).columns.to_list()
198227
encodings = {k:'json' for k in obj_columns}
199228

0 commit comments

Comments
 (0)