From 5620779ad0b381d9e7f7c107565adde49e46d4fb Mon Sep 17 00:00:00 2001 From: vinaysurtani Date: Sat, 20 Jun 2026 14:59:19 -0700 Subject: [PATCH 1/3] Add Pydantic model instance support in _coerce_reader - Add _PYDANTIC_AVAILABLE flag and _check_for_pydantic() to dependencies.py - Add Pydantic branch in _coerce_reader() in types.py before generic Iterable branch to auto-convert BaseModel instances via model_dump() - Supports both Pydantic v1 (.dict()) and v2 (.model_dump()) Fixes part of issue #1106 --- python/python/lance/dependencies.py | 10 ++++++++++ python/python/lance/types.py | 20 +++++++++++++++++++- reproduce.py | 25 +++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 reproduce.py diff --git a/python/python/lance/dependencies.py b/python/python/lance/dependencies.py index df6b71cb22f..fc92970308b 100644 --- a/python/python/lance/dependencies.py +++ b/python/python/lance/dependencies.py @@ -28,6 +28,7 @@ _RAFT_COMMON_AVAILABLE = True _HUGGING_FACE_AVAILABLE = True _TENSORFLOW_AVAILABLE = True +_PYDANTIC_AVAILABLE = True class _LazyModule(ModuleType): @@ -173,6 +174,7 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]: torch, _TORCH_AVAILABLE = _lazy_import("torch") datasets, _HUGGING_FACE_AVAILABLE = _lazy_import("datasets") tensorflow, _TENSORFLOW_AVAILABLE = _lazy_import("tensorflow") + _, _PYDANTIC_AVAILABLE = _lazy_import("pydantic") @lru_cache(maxsize=None) @@ -221,6 +223,12 @@ def _check_for_tensorflow(obj: Any, *, check_type: bool = True) -> bool: ) +def _check_for_pydantic(obj: Any, *, check_type: bool = True) -> bool: + return _PYDANTIC_AVAILABLE and _might_be( + cast("Hashable", type(obj) if check_type else obj), "pydantic" + ) + + __all__ = [ # lazy-load third party libs "datasets", @@ -234,6 +242,7 @@ def _check_for_tensorflow(obj: Any, *, check_type: bool = True) -> bool: "_check_for_numpy", "_check_for_pandas", "_check_for_polars", + "_check_for_pydantic", "_check_for_tensorflow", "_check_for_torch", "_LazyModule", @@ -241,6 +250,7 @@ def _check_for_tensorflow(obj: Any, *, check_type: bool = True) -> bool: "_NUMPY_AVAILABLE", "_PANDAS_AVAILABLE", "_POLARS_AVAILABLE", + "_PYDANTIC_AVAILABLE", "_TORCH_AVAILABLE", "_HUGGING_FACE_AVAILABLE", "_TENSORFLOW_AVAILABLE", diff --git a/python/python/lance/types.py b/python/python/lance/types.py index 41cc191e4d6..2a9e271a660 100644 --- a/python/python/lance/types.py +++ b/python/python/lance/types.py @@ -9,7 +9,11 @@ from pyarrow import RecordBatch from . import dataset -from .dependencies import _check_for_hugging_face, _check_for_pandas +from .dependencies import ( + _check_for_hugging_face, + _check_for_pandas, + _check_for_pydantic, +) from .dependencies import pandas as pd if TYPE_CHECKING: @@ -116,6 +120,20 @@ def batch_iter(): # List of dictionaries batch = pa.RecordBatch.from_pylist(data_obj, schema=schema) return pa.RecordBatchReader.from_batches(batch.schema, [batch]) + elif ( + isinstance(data_obj, list) + and len(data_obj) > 0 + and _check_for_pydantic(data_obj[0]) + ): + from pydantic import BaseModel + + if isinstance(data_obj[0], BaseModel): + dicts = [ + item.model_dump() if hasattr(item, "model_dump") else item.dict() + for item in data_obj + ] + batch = pa.RecordBatch.from_pylist(dicts, schema=schema) + return pa.RecordBatchReader.from_batches(batch.schema, [batch]) # for other iterables, assume they are of type Iterable[RecordBatch] elif isinstance(data_obj, Iterable): if schema is not None: diff --git a/reproduce.py b/reproduce.py new file mode 100644 index 00000000000..0d96a9b4e73 --- /dev/null +++ b/reproduce.py @@ -0,0 +1,25 @@ +import lance +import pyarrow as pa +from pydantic import BaseModel + + +class MyModel(BaseModel): + name: str + score: float + + +data = [MyModel(name="alice", score=0.9), MyModel(name="bob", score=0.8)] + +# After fix: Pydantic instances are handled automatically +print("=== After Fix: Direct Pydantic Instance Support ===") +ds = lance.write_dataset(data, "/tmp/test.lance", mode="overwrite") +print(f"Success! Wrote {ds.count_rows()} rows without manual conversion.") +print(ds.to_table()) + +print() +print("=== Old Workaround (still works) ===") +dicts = [m.model_dump() for m in data] +schema = pa.schema([pa.field("name", pa.string()), pa.field("score", pa.float64())]) +table = pa.Table.from_pylist(dicts, schema=schema) +lance.write_dataset(table, "/tmp/test_manual.lance", mode="overwrite") +print("Manual conversion path still works as expected.") From 5bfdbf5a50cf24c8ce108a52fb997946526cff78 Mon Sep 17 00:00:00 2001 From: vinaysurtani Date: Sat, 20 Jun 2026 15:02:18 -0700 Subject: [PATCH 2/3] Add LanceDataset.from_pydantic_model() classmethod and tests - Add from_pydantic_model() classmethod to LanceDataset that infers table name (snake_case) and schema from a Pydantic model class - Add Pydantic instances to test_input_data parametrized test suite - Add test_from_pydantic_model() to verify classmethod behavior Fixes issue #1106 --- python/python/lance/dataset.py | 39 +++++++++++++++++++++++++++++ python/python/tests/test_dataset.py | 26 +++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 45dc1b253d3..fc45dc159b6 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -793,6 +793,45 @@ def __deserialize__( base_store_params=base_store_params, ) + @classmethod + def from_pydantic_model( + cls, + model_class, + data, + uri: Optional[Union[str, Path]] = None, + mode: str = "create", + **kwargs, + ) -> "LanceDataset": + """Create a LanceDataset from a Pydantic model class and a list of instances. + + The table name is inferred from the model class name converted to snake_case. + The schema is inferred from the data. + + Parameters + ---------- + model_class : type + A Pydantic BaseModel subclass. + data : list + A list of Pydantic model instances. + uri : str or Path, optional + The URI to write the dataset to. If not provided, the model class name + converted to snake_case is used as the path. + mode : str, optional + The write mode. One of "create", "overwrite", or "append". + **kwargs + Additional arguments passed to write_dataset(). + """ + import re + + if uri is None: + uri = re.sub(r"(? Date: Sat, 20 Jun 2026 15:18:22 -0700 Subject: [PATCH 3/3] Remove reproduce.py from tracking, add to .gitignore --- .gitignore | 1 + reproduce.py | 25 ------------------------- 2 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 reproduce.py diff --git a/.gitignore b/.gitignore index dcc9c5089ff..3da82f3e690 100644 --- a/.gitignore +++ b/.gitignore @@ -125,3 +125,4 @@ docs/src/community/project-specific/namespace-impls.md docs/src/community/project-specific/ray.md docs/src/community/project-specific/spark.md docs/src/community/project-specific/trino.md +reproduce.py diff --git a/reproduce.py b/reproduce.py deleted file mode 100644 index 0d96a9b4e73..00000000000 --- a/reproduce.py +++ /dev/null @@ -1,25 +0,0 @@ -import lance -import pyarrow as pa -from pydantic import BaseModel - - -class MyModel(BaseModel): - name: str - score: float - - -data = [MyModel(name="alice", score=0.9), MyModel(name="bob", score=0.8)] - -# After fix: Pydantic instances are handled automatically -print("=== After Fix: Direct Pydantic Instance Support ===") -ds = lance.write_dataset(data, "/tmp/test.lance", mode="overwrite") -print(f"Success! Wrote {ds.count_rows()} rows without manual conversion.") -print(ds.to_table()) - -print() -print("=== Old Workaround (still works) ===") -dicts = [m.model_dump() for m in data] -schema = pa.schema([pa.field("name", pa.string()), pa.field("score", pa.float64())]) -table = pa.Table.from_pylist(dicts, schema=schema) -lance.write_dataset(table, "/tmp/test_manual.lance", mode="overwrite") -print("Manual conversion path still works as expected.")