Skip to content

Commit 1a9d469

Browse files
authored
feat: Add Polars eager DataFrame support (#82)
1 parent 3e85d8e commit 1a9d469

15 files changed

Lines changed: 801 additions & 42 deletions

deepnote_toolkit/chart/deepnote_chart.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
def _create_vf_runtime_for_dataframe(
2929
oc_df: oc.DataFrame, name: str
3030
) -> vegafusion.VegaFusionRuntime:
31-
if oc_df.native_type == "pandas":
31+
if oc_df.native_type in ("pandas", "polars-eager"):
3232
return vegafusion.VegaFusionRuntime()
3333
elif oc_df.native_type == "pyspark":
3434
spark_df = oc_df.to_native()
@@ -55,7 +55,7 @@ def spark_executor(sql_query: str) -> pa.Table:
5555

5656

5757
def _create_vf_inline_dataset_from_dataframe(oc_df: oc.DataFrame) -> Any:
58-
if oc_df.native_type == "pandas":
58+
if oc_df.native_type in ("pandas", "polars-eager"):
5959
return oc_df.to_native()
6060
elif oc_df.native_type == "pyspark":
6161
from pyspark.sql.pandas.types import to_arrow_schema
@@ -141,7 +141,11 @@ def __init__(
141141
if filtered_df.native_type == "pandas":
142142
sanitized_pandas = sanitize_dataframe_for_chart(filtered_df.to_native())
143143
oc_sanitized_df = oc.DataFrame.from_native(sanitized_pandas)
144-
elif filtered_df.native_type == "pyspark":
144+
elif filtered_df.native_type in ("pyspark", "polars-eager"):
145+
# We don't need to sanitize Spark DFs because they will be processed by Spark itself and it can handle
146+
# all data types by itself
147+
# Polars is powered by Arrow, which is the same format used internally by VegaFusion, so there is no need
148+
# to do any additional sanitization for it either
145149
oc_sanitized_df = filtered_df
146150
else:
147151
raise TypeError(

deepnote_toolkit/dataframe_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ def dataframe_formatter(native_df: oc.NativeInputDF):
102102
mimebundle_formatter.for_type_by_name(
103103
"pyspark.pandas.frame", "DataFrame", dataframe_formatter
104104
)
105+
mimebundle_formatter.for_type_by_name(
106+
"polars.dataframe.frame", "DataFrame", dataframe_formatter
107+
)
105108
logger.info("Attached mimebundle formatters")
106109

107110

deepnote_toolkit/ocelots/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .dataframe import (
88
DataFrame,
99
is_wrapped_pandas_dataframe,
10+
is_wrapped_polars_eager_dataframe,
1011
is_wrapped_pyspark_dataframe,
1112
)
1213
from .filters import Filter, FilterOperator
@@ -19,11 +20,13 @@
1920
NativeOutputType,
2021
PandasDF,
2122
PandasOnSparkDF,
23+
PolarsEagerDF,
2224
PysparkDF,
2325
UnsupportedDataFrameException,
2426
)
2527
from .utils import (
2628
is_pandas_dataframe,
2729
is_pandas_on_spark_dataframe,
30+
is_polars_eager_dataframe,
2831
is_pyspark_dataframe,
2932
)

deepnote_toolkit/ocelots/dataframe.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323

2424
from deepnote_toolkit.ocelots.filters import Filter
2525
from deepnote_toolkit.ocelots.pandas.implementation import PandasImplementation
26+
from deepnote_toolkit.ocelots.polars.implementation_eager import (
27+
PolarsEagerImplementation,
28+
)
2629
from deepnote_toolkit.ocelots.pyspark.implementation import PysparkImplementation
2730
from deepnote_toolkit.ocelots.types import (
2831
Column,
@@ -32,16 +35,20 @@
3235
NativeOutputType,
3336
PandasDF,
3437
PandasOnSparkDF,
38+
PolarsEagerDF,
3539
PysparkDF,
3640
UnsupportedDataFrameException,
3741
)
3842
from deepnote_toolkit.ocelots.utils import (
3943
is_pandas_dataframe,
4044
is_pandas_on_spark_dataframe,
45+
is_polars_eager_dataframe,
4146
is_pyspark_dataframe,
4247
)
4348

44-
Implementation = Union[PandasImplementation, PysparkImplementation]
49+
Implementation = Union[
50+
PandasImplementation, PysparkImplementation, PolarsEagerImplementation
51+
]
4552

4653
T = TypeVar("T", bound=NativeOutputDF)
4754
FromNativeT = TypeVar("FromNativeT", bound=NativeOutputDF)
@@ -86,6 +93,7 @@ def is_supported(cls, df: Any) -> bool:
8693
is_pandas_dataframe(df)
8794
or is_pyspark_dataframe(df)
8895
or is_pandas_on_spark_dataframe(df)
96+
or is_polars_eager_dataframe(df)
8997
)
9098

9199
# Special case for Pandas-on-Spark DFs, as they aren't wrapped directly, but converted
@@ -117,6 +125,8 @@ def from_native(cls, df: NativeInputDF):
117125
return cls(PandasImplementation(df))
118126
if is_pyspark_dataframe(df):
119127
return cls(PysparkImplementation(df))
128+
if is_polars_eager_dataframe(df):
129+
return cls(PolarsEagerImplementation(df))
120130
if is_pandas_on_spark_dataframe(df):
121131
# NOTE: we accept Pandas-on-Spark dataframes, but we convert them into Spark and
122132
# work like with it same as with normal Spark DF from that.
@@ -144,18 +154,27 @@ def from_native(cls, df: NativeInputDF):
144154
return cls(PysparkImplementation(df.to_spark()))
145155

146156
raise UnsupportedDataFrameException(
147-
f"expected Pandas or PySpark dataframe, got {type(df)}"
157+
f"expected Pandas, PySpark, or Polars dataframe, got {type(df)}"
148158
)
149159

150160
@property
151161
def native_type(self) -> NativeOutputType:
152162
"""Get the native type of the dataframe.
153163
154164
Returns:
155-
NativeType: Either 'pandas' or 'pyspark'
165+
NativeOutputType: Either 'pandas', 'pyspark', or 'polars-eager'
156166
"""
157167
return self._implementation.name
158168

169+
@property
170+
def lazy(self) -> bool:
171+
"""Whether the underlying dataframe uses lazy evaluation.
172+
173+
Lazy dataframes (e.g. PySpark) defer computation until results are collected,
174+
while eager ones (e.g. pandas, Polars) evaluate immediately.
175+
"""
176+
return self._implementation.lazy
177+
159178
@property
160179
def columns(self) -> Tuple[Column, ...]:
161180
"""Get the list of columns in the dataframe.
@@ -425,3 +444,9 @@ def is_wrapped_pandas_dataframe(df: DataFrame) -> TypeGuard[DataFrame[PandasDF]]
425444

426445
def is_wrapped_pyspark_dataframe(df: DataFrame) -> TypeGuard[DataFrame[PysparkDF]]:
427446
return df.native_type == "pyspark"
447+
448+
449+
def is_wrapped_polars_eager_dataframe(
450+
df: DataFrame,
451+
) -> TypeGuard[DataFrame[PolarsEagerDF]]:
452+
return df.native_type == "polars-eager"

deepnote_toolkit/ocelots/pandas/implementation.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55
from typing_extensions import Self
66

7+
from deepnote_toolkit.logging import LoggerManager
78
from deepnote_toolkit.ocelots.constants import (
89
DEEPNOTE_INDEX_COLUMN,
910
MAX_COLUMNS_TO_DISPLAY,
@@ -21,11 +22,14 @@
2122
flatten_column_name,
2223
)
2324

25+
logger = LoggerManager().get_logger()
26+
2427

2528
class PandasImplementation:
2629
"""Implementation of DataFrame methods for pandas dataframes."""
2730

2831
name: Literal["pandas"] = "pandas"
32+
lazy: bool = False
2933

3034
def __init__(self, df: pd.DataFrame):
3135
self._df = df
@@ -284,7 +288,8 @@ def filter(self, *filters: Filter) -> Self:
284288

285289
masks.append(mask)
286290

287-
except (ValueError, TypeError):
291+
except (ValueError, TypeError) as e:
292+
logger.warning("Skipping filter on column %r: %s", filter_obj.column, e)
288293
continue
289294

290295
if masks:

deepnote_toolkit/ocelots/polars/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)