buckaroo-data
diff --git a/buckaroo/customizations/analysis.py b/buckaroo/customizations/analysis.py
Lines changed: 16 additions & 2 deletions
diff --git a/buckaroo/customizations/pd_stats_v2.py b/buckaroo/customizations/pd_stats_v2.py
Lines changed: 31 additions & 1 deletion
diff --git a/buckaroo/customizations/pl_stats_v2.py b/buckaroo/customizations/pl_stats_v2.py
Lines changed: 16 additions & 2 deletions
diff --git a/buckaroo/customizations/styling.py b/buckaroo/customizations/styling.py
Lines changed: 13 additions & 0 deletions
diff --git a/buckaroo/dataflow/dataflow.py b/buckaroo/dataflow/dataflow.py
Lines changed: 3 additions & 1 deletion
diff --git a/buckaroo/ddd_library.py b/buckaroo/ddd_library.py
Lines changed: 42 additions & 0 deletions
diff --git a/buckaroo/pluggable_analysis_framework/safe_summary_df.py b/buckaroo/pluggable_analysis_framework/safe_summary_df.py
Lines changed: 5 additions & 0 deletions
diff --git a/buckaroo/serialization_utils.py b/buckaroo/serialization_utils.py
Lines changed: 29 additions & 0 deletions
@@ -76,7 +76,10 @@ class TypingStats(ColAnalysis):
 
     provides_defaults = {
         'dtype':'asdf', 'is_numeric':False, 'is_integer':False,
-        'is_datetime':False, 'is_bool':False, 'is_float':False, '_type':'asdf'}
+        'is_datetime':False, 'is_timedelta':False, 'is_bool':False,
+        'is_float':False, 'is_categorical':False, 'is_period':False,
+        'is_interval':False, 'is_time':False, 'is_decimal':False,
+        'is_binary':False, '_type':'asdf'}
 
     @staticmethod
     def series_summary(sampled_ser, ser):
@@ -85,9 +88,13 @@ def series_summary(sampled_ser, ser):
             is_numeric=pd.api.types.is_numeric_dtype(ser),
             is_integer=pd.api.types.is_integer_dtype(ser),
             is_datetime=pd.api.types.is_datetime64_any_dtype(ser),
+            is_timedelta=pd.api.types.is_timedelta64_dtype(ser),
             is_bool=pd.api.types.is_bool_dtype(ser),
             is_float=pd.api.types.is_float_dtype(ser),
             is_string=pd.api.types.is_string_dtype(ser),
+            is_categorical=isinstance(ser.dtype, pd.CategoricalDtype),
+            is_period=isinstance(ser.dtype, pd.PeriodDtype),
+            is_interval=isinstance(ser.dtype, pd.IntervalDtype),
             memory_usage=ser.memory_usage())
 
     @staticmethod
@@ -100,9 +107,16 @@ def computed_summary(summary_dict):
                 _type = "float"
             else:
                 _type = "integer"
-        #elif pd.api.types.is_datetime64_any_dtype(ser):
+        elif summary_dict.get('is_timedelta'):
+            _type = 'duration'
         elif summary_dict['is_datetime']:
             _type = 'datetime'
+        elif summary_dict.get('is_categorical'):
+            _type = 'categorical'
+        elif summary_dict.get('is_period'):
+            _type = 'period'
+        elif summary_dict.get('is_interval'):
+            _type = 'interval'
         elif summary_dict['is_string']:
             _type = "string"
         return dict(_type=_type)
 
@@ -62,9 +62,16 @@ def orig_col_name(ser: RawSeries) -> Any:
     'is_numeric': bool,
     'is_integer': bool,
     'is_datetime': bool,
+    'is_timedelta': bool,
     'is_bool': bool,
     'is_float': bool,
     'is_string': bool,
+    'is_categorical': bool,
+    'is_period': bool,
+    'is_interval': bool,
+    'is_time': bool,
+    'is_decimal': bool,
+    'is_binary': bool,
     'memory_usage': int,
 })
 
@@ -77,25 +84,48 @@ def typing_stats(ser: RawSeries) -> TypingResult:
         'is_numeric': pd.api.types.is_numeric_dtype(ser),
         'is_integer': pd.api.types.is_integer_dtype(ser),
         'is_datetime': pd.api.types.is_datetime64_any_dtype(ser),
+        'is_timedelta': pd.api.types.is_timedelta64_dtype(ser),
         'is_bool': pd.api.types.is_bool_dtype(ser),
         'is_float': pd.api.types.is_float_dtype(ser),
         'is_string': pd.api.types.is_string_dtype(ser),
+        'is_categorical': isinstance(ser.dtype, pd.CategoricalDtype),
+        'is_period': isinstance(ser.dtype, pd.PeriodDtype),
+        'is_interval': isinstance(ser.dtype, pd.IntervalDtype),
+        'is_time': False,
+        'is_decimal': False,
+        'is_binary': False,
         'memory_usage': ser.memory_usage(),
     }
 
 
 @stat()
 def _type(is_bool: bool, is_numeric: bool, is_float: bool,
-          is_datetime: bool, is_string: bool) -> str:
+          is_datetime: bool, is_timedelta: bool, is_string: bool,
+          is_categorical: bool, is_period: bool, is_interval: bool,
+          is_time: bool, is_decimal: bool, is_binary: bool) -> str:
     """Derive the human-readable column type string."""
     if is_bool:
         return "boolean"
+    elif is_decimal:
+        return "decimal"
     elif is_numeric:
         if is_float:
             return "float"
         return "integer"
+    elif is_timedelta:
+        return "duration"
+    elif is_time:
+        return "time"
     elif is_datetime:
         return "datetime"
+    elif is_categorical:
+        return "categorical"
+    elif is_period:
+        return "period"
+    elif is_interval:
+        return "interval"
+    elif is_binary:
+        return "binary"
     elif is_string:
         return "string"
     return "obj"
 
@@ -52,7 +52,14 @@ def pl_orig_col_name(ser: RawSeries) -> Any:
     'is_float': bool,
     'is_bool': bool,
     'is_datetime': bool,
+    'is_timedelta': bool,
     'is_string': bool,
+    'is_categorical': bool,
+    'is_period': bool,
+    'is_interval': bool,
+    'is_time': bool,
+    'is_decimal': bool,
+    'is_binary': bool,
     'memory_usage': int,
 })
 
@@ -63,12 +70,19 @@ def pl_typing_stats(ser: RawSeries) -> PlTypingResult:
     dt = ser.dtype
     return {
         'dtype': str(dt),
-        'is_numeric': dt.is_numeric(),
+        'is_numeric': dt.is_numeric() and dt.base_type() is not pl.Decimal,
         'is_integer': dt.is_integer(),
         'is_float': dt.is_float(),
         'is_bool': dt == pl.Boolean,
-        'is_datetime': dt.is_temporal(),
+        'is_datetime': dt.is_temporal() and dt not in (pl.Duration, pl.Time),
+        'is_timedelta': dt == pl.Duration,
         'is_string': dt in (pl.Utf8, pl.String),
+        'is_categorical': dt == pl.Categorical or isinstance(dt, pl.Enum),
+        'is_period': False,
+        'is_interval': False,
+        'is_time': dt == pl.Time,
+        'is_decimal': dt.base_type() is pl.Decimal,
+        'is_binary': dt == pl.Binary,
         'memory_usage': ser.estimated_size(),
     }
 
 
@@ -41,6 +41,9 @@ def _formatted_char_count(displayer_args, column_metadata):
     if d in ('datetimeLocaleString', 'datetimeDefault'):
         return 18  # "12/31/2024, 11:59 PM"
 
+    if d == 'duration':
+        return 14  # e.g. "365d 23h 59m 59s"
+
     return 8  # obj / fallback
 
 
@@ -79,6 +82,16 @@ def style_column(kls, col:str, column_metadata: Any) -> Any:
             disp = {'displayer': 'float', 'min_fraction_digits':digits, 'max_fraction_digits':digits}
         elif t == 'datetime':
             disp = {'displayer': 'datetimeLocaleString','locale': 'en-US',  'args': {}}
+        elif t == 'decimal':
+            disp = {'displayer': 'float', 'min_fraction_digits':digits, 'max_fraction_digits':digits}
+        elif t == 'duration':
+            disp = {'displayer': 'duration'}
+        elif t in ('time', 'categorical', 'period', 'interval'):
+            disp = {'displayer': 'string', 'max_length': 35}
+            base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': str(col)}
+        elif t == 'binary':
+            disp = {'displayer': 'obj'}
+            base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': str(col)}
         elif t == 'string':
             disp = {'displayer': 'string', 'max_length': 35}
             base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': str(col)}
 
@@ -408,7 +408,9 @@ def _get_summary_sd(self, processed_df:pd.DataFrame) -> Tuple[SDType, TDict[str,
             if self.debug:
                 raise Exception("Error executing analysis")
             else:
-                return {}, stats.errs
+                # Return partial results — non-critical errors (e.g. histogram
+                # failure for Decimal) shouldn't discard all stats
+                return sdf, stats.errs
         else:
             return sdf, {}
 
 
@@ -154,6 +154,48 @@ def get_df_with_named_index() -> pd.DataFrame:
                         index=pd.Index([10,20,30,40,50], name='foo'))
 
 
+def df_with_weird_types() -> pd.DataFrame:
+    """DataFrame with unusual dtypes that historically broke rendering.
+
+    Exercises: categorical, timedelta, period, interval.
+    """
+    return pd.DataFrame({
+        'categorical': pd.Categorical(['red', 'green', 'blue', 'red', 'green']),
+        'timedelta': pd.to_timedelta(['1 days 02:03:04', '0 days 00:00:01',
+                                       '365 days', '0 days 00:00:00.001',
+                                       '0 days 00:00:00.000100']),
+        'period': pd.Series(pd.period_range('2021-01', periods=5, freq='M')),
+        'interval': pd.Series(pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3, 4, 5])),
+        'int_col': [10, 20, 30, 40, 50],
+    })
+
+
+def pl_df_with_weird_types():
+    """Polars DataFrame with unusual dtypes that historically broke rendering.
+
+    Exercises: Duration (issue #622), Time, Categorical, Decimal, Binary.
+    Must be displayed with PolarsBuckarooWidget, not the default pandas widget.
+    """
+    import datetime as dt
+    import polars as pl
+    return pl.DataFrame({
+        'duration': pl.Series([100_000, 3_723_000_000, 86_400_000_000,
+                               500, 60_000_000], dtype=pl.Duration('us')),
+        'time': [dt.time(14, 30), dt.time(9, 15, 30),
+                 dt.time(0, 0, 1), dt.time(23, 59, 59), dt.time(12, 0)],
+        'categorical': pl.Series(['red', 'green', 'blue', 'red', 'green']).cast(pl.Categorical),
+        'decimal': pl.Series(['100.50', '200.75', '0.01',
+                              '99999.99', '3.14']).cast(pl.Decimal(10, 2)),
+        'binary': [b'hello', b'world', b'\x00\x01\x02', b'test', b'\xff\xfe'],
+        'int_col': [10, 20, 30, 40, 50],
+    })
+
+
+def pl_df_with_weird_types_as_pandas():
+    """Polars weird types converted to pandas for use with pandas-based widgets."""
+    return pl_df_with_weird_types().to_pandas()
+
+
 """
 Mkae a duplicate column dataframe
 
 
@@ -96,6 +96,11 @@ def output_full_reproduce(errs, summary_df, df_name):
     try:
         for ser_name, err_kls in errs.items():
             err, kls = err_kls
+            if kls is None:
+                # v2 stat functions don't have a v1 ColAnalysis class
+                col, stat = ser_name if isinstance(ser_name, tuple) else (ser_name, '?')
+                print(f"# {col}:{stat} — {err}")
+                continue
             reproduce_summary(ser_name, kls, summary_df, err, df_name)
     except Exception:
         #this is tricky stuff that shouldn't error, I want these stack traces to escape being caught
 
@@ -118,8 +118,25 @@ def force_to_pandas(df_pd_or_pl) -> pd.DataFrame:
 
 
 
+def _coerce_for_json(df: pd.DataFrame) -> pd.DataFrame:
+    """Coerce, in place, column types DataFrame.to_json can't handle: period/interval/timedelta -> str, bytes -> hex."""
+    for col in df.columns:
+        dtype = df[col].dtype
+        if isinstance(dtype, (pd.PeriodDtype, pd.IntervalDtype)):
+            df[col] = df[col].astype(str)
+        elif pd.api.types.is_timedelta64_dtype(dtype):
+            df[col] = df[col].astype(str)
+        elif dtype == object:  # noqa: E721 — `==` is required: the dtype is np.dtype('O'), which is never `is object`
+            # Sniff only the first non-null value for raw bytes (e.g. from pl.Binary)
+            sample = df[col].dropna().head(1)
+            if len(sample) > 0 and isinstance(sample.iloc[0], bytes):
+                df[col] = df[col].apply(lambda x: x.hex() if isinstance(x, bytes) else x)
+    return df
+
+
 def pd_to_obj(df:pd.DataFrame) -> Dict[str, Any]:
     df2 = prepare_df_for_serialization(df)
+    df2 = _coerce_for_json(df2)
     # Add level_0 for JSON serialization to maintain backwards compatibility
     # This is only needed for JSON, not for Parquet serialization
     if not isinstance(df.index, pd.MultiIndex):
@@ -194,6 +211,18 @@ def to_parquet(df):
         if pd.api.types.is_string_dtype(df2[col].dtype) and not pd.api.types.is_object_dtype(df2[col].dtype):
             df2[col] = df2[col].astype('object')
 
+    # Convert dtypes that fastparquet can't handle to string/object
+    for col in df2.columns:
+        dtype = df2[col].dtype
+        if isinstance(dtype, (pd.PeriodDtype, pd.IntervalDtype)):
+            df2[col] = df2[col].astype(str)
+        elif pd.api.types.is_timedelta64_dtype(dtype):
+            df2[col] = df2[col].astype(str)
+        elif dtype == object:  # noqa: E721 — `==` is required: the dtype is np.dtype('O'), which is never `is object`
+            sample = df2[col].dropna().head(1)
+            if len(sample) > 0 and isinstance(sample.iloc[0], bytes):
+                df2[col] = df2[col].apply(lambda x: x.hex() if isinstance(x, bytes) else x)
+
     obj_columns = df2.select_dtypes([pd.CategoricalDtype(), 'object']).columns.to_list()
     encodings = {k:'json' for k in obj_columns}