diff --git a/daft_lance/_lance.py b/daft_lance/_lance.py index 21b7c67..18c291a 100644 --- a/daft_lance/_lance.py +++ b/daft_lance/_lance.py @@ -17,7 +17,7 @@ from .lance_compaction import compact_files_internal from .lance_merge_column import merge_columns_from_df, merge_columns_internal from .lance_scalar_index import create_scalar_index_internal -from .lance_scan import LanceDBScanOperator +from .lance_scan import LanceScanOperator from .utils import construct_lance_dataset if TYPE_CHECKING: @@ -43,12 +43,12 @@ def read_lance( include_fragment_id: bool | None = None, checkpoint: CheckpointConfig | None = None, ) -> DataFrame: - """Create a DataFrame from a LanceDB table. + """Create a DataFrame from a Lance dataset. Args: uri: The URI of the Lance table to read from. Accepts a local path or an object-store URI like "s3://bucket/path". - io_config: A custom IOConfig to use when accessing LanceDB data. Defaults to None. + io_config: A custom IOConfig to use when accessing Lance data. Defaults to None. version : optional, int | str If specified, load a specific version of the Lance dataset. Else, loads the latest version. A version number (`int`) or a tag (`str`) can be provided. @@ -99,25 +99,25 @@ def read_lance( already exists in the store are skipped on re-run. Requires the Ray runner. Returns: - DataFrame: a DataFrame with the schema converted from the specified LanceDB table + DataFrame: a DataFrame with the schema converted from the specified Lance dataset - This function requires the use of [LanceDB](https://lancedb.github.io/lancedb/), which is the Python library for the LanceDB project. + This function reads Lance datasets via the Lance Python package. To ensure that this is installed with Daft, you may install: `pip install daft[lance]` Examples: - Read a local LanceDB table: + Read a local Lance dataset: >>> df = daft.read_lance("/path/to/lance/data/") >>> df.show() - Read a LanceDB table and specify a version: + Read a Lance dataset and specify a version: >>> df = daft.read_lance("/path/to/lance/data/", version=1) >>> df.show() - Read a LanceDB table with fragment grouping: + Read a Lance dataset with fragment grouping: >>> df = daft.read_lance("/path/to/lance/data/", fragment_group_size=5) >>> df.show() - Read a LanceDB table from a public S3 bucket: + Read a Lance dataset from a public S3 bucket: >>> from daft.io import S3Config, IOConfig >>> io_config = IOConfig(s3=S3Config(region="us-west-2", anonymous=True)) >>> df = daft.read_lance("s3://daft-oss-public-data/lance/words-test-dataset", io_config=io_config) @@ -147,7 +147,7 @@ def read_lance( metadata_cache_size_bytes=metadata_cache_size_bytes, ) - lance_operator = LanceDBScanOperator( + lance_operator = LanceScanOperator( ds, fragment_group_size=fragment_group_size, include_fragment_id=include_fragment_id, @@ -178,14 +178,14 @@ def merge_columns( default_scan_options: dict[str, Any] | None = None, metadata_cache_size_bytes: int | None = None, ) -> LanceDataset: - """Merge new columns into a LanceDB table using a transformation function. + """Merge new columns into a Lance dataset using a transformation function. - This function modifies the LanceDB table in-place by adding new columns computed + This function modifies the Lance dataset in-place by adding new columns computed from existing data using a transformation function. It does not return a DataFrame. Args: uri: The URI of the Lance table (supports remote URLs to object stores such as `s3://` or `gs://`) - io_config: A custom IOConfig to use when accessing LanceDB data. Defaults to None. + io_config: A custom IOConfig to use when accessing Lance data. Defaults to None. transform: A transformation function or UDF to apply to the data. read_columns: List of column names to read for the transformation. reader_schema: Schema for the reader. @@ -204,17 +204,17 @@ def merge_columns( None: This function modifies the table in-place and does not return a value. Note: - This function requires the use of [LanceDB](https://lancedb.github.io/lancedb/), which is the Python library for the LanceDB project. + This function writes Lance datasets via the Lance Python package. To ensure that this is installed with Daft, you may install: `pip install daft[lance]` Examples: - Merge new columns into a local LanceDB table: + Merge new columns into a local Lance dataset: >>> def double_score(batch): ... # Example transformation function ... import pyarrow.compute as pc ... ... return batch.append_column("new_column", pc.multiply(batch["c"], 2)) - >>> daft_lance.merge_columns("s3://my-lancedb-bucket/data/", transform=double_score) + >>> daft_lance.merge_columns("s3://my-lance-bucket/data/", transform=double_score) """ if transform is None: raise ValueError( @@ -273,13 +273,13 @@ def merge_columns_df( ) -> Any: """Row-level merge columns entrypoint using a DataFrame. - This function modifies the LanceDB table in-place by merging new columns from a DataFrame + This function modifies the Lance dataset in-place by merging new columns from a DataFrame into existing fragments using a row-level join. It does not return a DataFrame. Args: df: DataFrame containing the new columns to merge along with fragment_id and join key columns - uri: URL to the LanceDB table (supports remote URLs to object stores such as `s3://` or `gs://`) - io_config: A custom IOConfig to use when accessing LanceDB data. Defaults to None. + uri: URL to the Lance dataset (supports remote URLs to object stores such as `s3://` or `gs://`) + io_config: A custom IOConfig to use when accessing Lance data. Defaults to None. read_columns: List of column names to read for the transformation. reader_schema: Schema for the reader. storage_options: Extra options for storage connection. @@ -300,22 +300,22 @@ def merge_columns_df( None: This function modifies the table in-place and does not return a value. Note: - This function requires the use of [LanceDB](https://lancedb.github.io/lancedb/), which is the Python library for the LanceDB project. + This function writes Lance datasets via the Lance Python package. To ensure that this is installed with Daft, you may install: `pip install daft[lance]` Examples: - Merge new columns into a local LanceDB table: + Merge new columns into a local Lance dataset: >>> import daft >>> # Read the existing table with row addresses >>> df = daft.read_lance( - ... "s3://my-lancedb-bucket/data/", + ... "s3://my-lance-bucket/data/", ... default_scan_options={"with_row_address": True}, ... include_fragment_id=True, ... ) >>> # Add new columns based on existing data >>> df = df.with_column("doubled_c", df["c"] * 2) >>> # Merge the new columns back to the table - >>> daft_lance.merge_columns_df(df, "s3://my-lancedb-bucket/data/") + >>> daft_lance.merge_columns_df(df, "s3://my-lance-bucket/data/") """ io_config = context.get_context().daft_planning_config.default_io_config if io_config is None else io_config storage_options = storage_options or io_config_to_storage_options(io_config, uri) @@ -385,7 +385,7 @@ def create_scalar_index( Args: uri: The URI of the Lance table (supports remote URLs to object stores such as `s3://` or `gs://`) - io_config: A custom IOConfig to use when accessing LanceDB data. Defaults to None. + io_config: A custom IOConfig to use when accessing Lance data. Defaults to None. column: Column name to index index_type: Type of index to build. For distributed execution this supports "INVERTED", "FTS", and "BTREE". @@ -424,7 +424,7 @@ def create_scalar_index( ImportError: If lance package is not available Note: - This function requires the use of [LanceDB](https://lancedb.github.io/lancedb/), which is the Python library for the LanceDB project. + This function writes Lance datasets via the Lance Python package. To ensure that this is installed with Daft, you may install: `pip install daft[lance]` Examples: @@ -510,7 +510,7 @@ def compact_files( Args: uri: The URI of the Lance table (supports remote URLs to object stores such as `s3://` or `gs://`) - io_config: A custom IOConfig to use when accessing LanceDB data. Defaults to None. + io_config: A custom IOConfig to use when accessing Lance data. Defaults to None. storage_options: Extra options for storage connection. version: If specified, load a specific version of the Lance dataset. asof: If specified, find the latest version created on or earlier than the given argument value. diff --git a/daft_lance/lance_scan.py b/daft_lance/lance_scan.py index de3349f..dbbc98d 100644 --- a/daft_lance/lance_scan.py +++ b/daft_lance/lance_scan.py @@ -23,7 +23,7 @@ # TODO support fts and fast_search -def _lancedb_table_factory_function( +def _lance_table_factory_function( ds_uri: str, open_kwargs: dict[Any, Any] | None = None, fragment_ids: list[int] | None = None, @@ -110,13 +110,13 @@ def _batches() -> Iterator[PyRecordBatch]: return _iter_batches() -def _lancedb_count_result_function( +def _lance_count_result_function( ds_uri: str, open_kwargs: dict[Any, Any] | None, required_column: str, filter: pa.compute.Expression | None = None, ) -> Iterator[PyRecordBatch]: - """Use LanceDB's API to count rows and return a record batch with the count result.""" + """Use Lance's API to count rows and return a record batch with the count result.""" ds = lance.dataset(ds_uri, **(open_kwargs or {})) logger.debug("Using metadata for counting all rows") count = ds.count_rows(filter=filter) @@ -128,7 +128,7 @@ def _lancedb_count_result_function( yield result_batch._recordbatch -class LanceDBScanOperator(ScanOperator, SupportsPushdownFilters): +class LanceScanOperator(ScanOperator, SupportsPushdownFilters): def __init__( self, ds: lance.LanceDataset, @@ -147,10 +147,10 @@ def __init__( self._schema = convert_lance_schema(base) def name(self) -> str: - return "LanceDBScanOperator" + return "LanceScanOperator" def display_name(self) -> str: - return f"LanceDBScanOperator({self._ds.uri})" + return f"LanceScanOperator({self._ds.uri})" def schema(self) -> Schema: return self._schema @@ -254,8 +254,8 @@ def _create_count_rows_scan_task(self, pushdowns: PyPushdowns) -> Iterator[ScanT new_schema = Schema.from_pyarrow_schema(pa.schema([pa.field(fields[0], pa.uint64())])) open_kwargs = getattr(self._ds, "_lance_open_kwargs", None) yield ScanTask.python_factory_func_scan_task( - module=_lancedb_count_result_function.__module__, - func_name=_lancedb_count_result_function.__name__, + module=_lance_count_result_function.__module__, + func_name=_lance_count_result_function.__name__, func_args=(self._ds.uri, open_kwargs, fields[0], self._combine_filters_to_arrow()), schema=new_schema._schema, num_rows=1, @@ -293,8 +293,8 @@ def _create_scan_tasks_with_limit_and_no_filters( task_schema = self._schema yield ScanTask.python_factory_func_scan_task( - module=_lancedb_table_factory_function.__module__, - func_name=_lancedb_table_factory_function.__name__, + module=_lance_table_factory_function.__module__, + func_name=_lance_table_factory_function.__name__, func_args=( self._ds.uri, open_kwargs, @@ -328,8 +328,8 @@ def _python_factory_func_scan_task( size_bytes: int | None = None, ) -> ScanTask: return ScanTask.python_factory_func_scan_task( - module=_lancedb_table_factory_function.__module__, - func_name=_lancedb_table_factory_function.__name__, + module=_lance_table_factory_function.__module__, + func_name=_lance_table_factory_function.__name__, func_args=( self._ds.uri, open_kwargs, diff --git a/tests/io/lancedb/__init__.py b/tests/io/lance/__init__.py similarity index 100% rename from tests/io/lancedb/__init__.py rename to tests/io/lance/__init__.py diff --git a/tests/io/lancedb/conftest.py b/tests/io/lance/conftest.py similarity index 100% rename from tests/io/lancedb/conftest.py rename to tests/io/lance/conftest.py diff --git a/tests/io/lancedb/test_blob_v2_policy.py b/tests/io/lance/test_blob_v2_policy.py similarity index 100% rename from tests/io/lancedb/test_blob_v2_policy.py rename to tests/io/lance/test_blob_v2_policy.py diff --git a/tests/io/lancedb/test_expression_point_lookup.py b/tests/io/lance/test_expression_point_lookup.py similarity index 100% rename from tests/io/lancedb/test_expression_point_lookup.py rename to tests/io/lance/test_expression_point_lookup.py diff --git a/tests/io/lancedb/test_fast_path_merge.py b/tests/io/lance/test_fast_path_merge.py similarity index 100% rename from tests/io/lancedb/test_fast_path_merge.py rename to tests/io/lance/test_fast_path_merge.py diff --git a/tests/io/lancedb/test_lance_batching.py b/tests/io/lance/test_lance_batching.py similarity index 100% rename from tests/io/lancedb/test_lance_batching.py rename to tests/io/lance/test_lance_batching.py diff --git a/tests/io/lancedb/test_lance_blob_v2_e2e.py b/tests/io/lance/test_lance_blob_v2_e2e.py similarity index 100% rename from tests/io/lancedb/test_lance_blob_v2_e2e.py rename to tests/io/lance/test_lance_blob_v2_e2e.py diff --git a/tests/io/lancedb/test_lance_blob_v2_read.py b/tests/io/lance/test_lance_blob_v2_read.py similarity index 100% rename from tests/io/lancedb/test_lance_blob_v2_read.py rename to tests/io/lance/test_lance_blob_v2_read.py diff --git a/tests/io/lancedb/test_lance_blob_v2_write.py b/tests/io/lance/test_lance_blob_v2_write.py similarity index 100% rename from tests/io/lancedb/test_lance_blob_v2_write.py rename to tests/io/lance/test_lance_blob_v2_write.py diff --git a/tests/io/lancedb/test_lancedb_compaction.py b/tests/io/lance/test_lance_compaction.py similarity index 100% rename from tests/io/lancedb/test_lancedb_compaction.py rename to tests/io/lance/test_lance_compaction.py diff --git a/tests/io/lancedb/test_lancedb_count_pushdown_coverage.py b/tests/io/lance/test_lance_count_pushdown_coverage.py similarity index 88% rename from tests/io/lancedb/test_lancedb_count_pushdown_coverage.py rename to tests/io/lance/test_lance_count_pushdown_coverage.py index 076a7b4..779b2e9 100644 --- a/tests/io/lancedb/test_lancedb_count_pushdown_coverage.py +++ b/tests/io/lance/test_lance_count_pushdown_coverage.py @@ -11,7 +11,7 @@ from daft import col from daft.daft import CountMode from daft.recordbatch import RecordBatch -from daft_lance.lance_scan import LanceDBScanOperator, _lancedb_count_result_function +from daft_lance.lance_scan import LanceScanOperator, _lance_count_result_function class TestLanceCountResultFunction: @@ -30,20 +30,20 @@ def test_dataset_path(self, tmp_path_factory): lance.write_dataset(pa.Table.from_pydict(test_data), tmp_dir) yield str(tmp_dir) - def test_lancedb_count_no_filters_direct_call(self, test_dataset_path): + def test_lance_count_no_filters_direct_call(self, test_dataset_path): """Test that no filters list is handled correctly.""" ds = lance.dataset(test_dataset_path) - result_generator = _lancedb_count_result_function(ds.uri, None, "count") + result_generator = _lance_count_result_function(ds.uri, None, "count") result_batch = next(result_generator) record_batch = RecordBatch._from_pyrecordbatch(result_batch) result_dict = record_batch.to_pydict() assert result_dict["count"][0] == 6 - def test_lancedb_count_with_filters_path(self, test_dataset_path): + def test_lance_count_with_filters_path(self, test_dataset_path): """Test that filters list is handled correctly.""" ds = lance.dataset(test_dataset_path) filter_expr = pc.greater(pc.field("age"), pc.scalar(30)) - result_generator = _lancedb_count_result_function(ds.uri, None, "count", filter=filter_expr) + result_generator = _lance_count_result_function(ds.uri, None, "count", filter=filter_expr) result_batch = next(result_generator) record_batch = RecordBatch._from_pyrecordbatch(result_batch) result_dict = record_batch.to_pydict() @@ -52,7 +52,7 @@ def test_lancedb_count_with_filters_path(self, test_dataset_path): def test_unsupported_count_mode_fallback(self, test_dataset_path): """Test that unsupported count mode falls back to regular scan.""" ds = lance.dataset(test_dataset_path) - scan_op = LanceDBScanOperator(ds) + scan_op = LanceScanOperator(ds) with patch.object(scan_op, "supported_count_modes", return_value=[CountMode.All]): with patch("daft_lance.lance_scan.logger") as mock_logger: @@ -77,7 +77,7 @@ def test_unsupported_count_mode_fallback(self, test_dataset_path): def test_empty_filters_list_handling(self, test_dataset_path): """Test that empty filters list is handled correctly.""" ds = lance.dataset(test_dataset_path) - scan_op = LanceDBScanOperator(ds) + scan_op = LanceScanOperator(ds) pushed, remaining = scan_op.push_filters([]) assert len(pushed) == 0 diff --git a/tests/io/lancedb/test_lancedb_custom_task.py b/tests/io/lance/test_lance_custom_task.py similarity index 100% rename from tests/io/lancedb/test_lancedb_custom_task.py rename to tests/io/lance/test_lance_custom_task.py diff --git a/tests/io/lancedb/test_lance_data_sink_internals.py b/tests/io/lance/test_lance_data_sink_internals.py similarity index 100% rename from tests/io/lancedb/test_lance_data_sink_internals.py rename to tests/io/lance/test_lance_data_sink_internals.py diff --git a/tests/io/lancedb/test_lance_data_sink_storage_versions.py b/tests/io/lance/test_lance_data_sink_storage_versions.py similarity index 100% rename from tests/io/lancedb/test_lance_data_sink_storage_versions.py rename to tests/io/lance/test_lance_data_sink_storage_versions.py diff --git a/tests/io/lancedb/test_lancedb_factory_function.py b/tests/io/lance/test_lance_factory_function.py similarity index 95% rename from tests/io/lancedb/test_lancedb_factory_function.py rename to tests/io/lance/test_lance_factory_function.py index a02f8a8..c68ca80 100644 --- a/tests/io/lancedb/test_lancedb_factory_function.py +++ b/tests/io/lance/test_lance_factory_function.py @@ -7,7 +7,7 @@ import pytest from daft.recordbatch import RecordBatch -from daft_lance.lance_scan import _lancedb_table_factory_function +from daft_lance.lance_scan import _lance_table_factory_function # Import-or-skip lance once at module level so individual tests don't need to do this lance = pytest.importorskip("lance") @@ -57,7 +57,7 @@ def test_reconstructs_dataset_and_reads_fragments( # Collect records from factory out_batches = list( - _lancedb_table_factory_function( + _lance_table_factory_function( ds_uri=ds_direct.uri, open_kwargs=open_kwargs, fragment_ids=frag_ids, @@ -89,7 +89,7 @@ def test_raises_when_no_fragments(tmp_path_factory): with pytest.raises(RuntimeError) as ei: list( - _lancedb_table_factory_function( + _lance_table_factory_function( ds_uri=ds.uri, open_kwargs=None, fragment_ids=[], @@ -110,7 +110,7 @@ def test_invalid_fragment_id_raises(tmp_path_factory): with pytest.raises(Exception) as ei: list( - _lancedb_table_factory_function( + _lance_table_factory_function( ds_uri=ds.uri, open_kwargs=None, fragment_ids=[invalid_id], @@ -141,7 +141,7 @@ def test_open_kwargs_version_selects_correct_version(tmp_path_factory): frag_ids_v1 = [f.fragment_id for f in ds_v1.get_fragments()] out_batches = list( - _lancedb_table_factory_function( + _lance_table_factory_function( ds_uri=ds_v2.uri, open_kwargs={"version": 1}, fragment_ids=frag_ids_v1, diff --git a/tests/io/lancedb/test_lance_merge_evolution.py b/tests/io/lance/test_lance_merge_evolution.py similarity index 100% rename from tests/io/lancedb/test_lance_merge_evolution.py rename to tests/io/lance/test_lance_merge_evolution.py diff --git a/tests/io/lancedb/test_lancedb_point_lookup.py b/tests/io/lance/test_lance_point_lookup.py similarity index 97% rename from tests/io/lancedb/test_lancedb_point_lookup.py rename to tests/io/lance/test_lance_point_lookup.py index 92b07ff..782add2 100644 --- a/tests/io/lancedb/test_lancedb_point_lookup.py +++ b/tests/io/lance/test_lance_point_lookup.py @@ -20,7 +20,7 @@ def lance_dataset(tmp_path_factory): def _scan(ds): - return lance_scan.LanceDBScanOperator(ds) + return lance_scan.LanceScanOperator(ds) @pytest.mark.parametrize( @@ -102,7 +102,7 @@ def test_scanner_without_fragments(lance_dataset, idx_type): arrow_filter = Expression._from_pyexpr((col("id") == 2)._expr).to_arrow_expr() # Invoke factory with fragment_ids=None to exercise index-driven fragment selection - gen = lance_scan._lancedb_table_factory_function( + gen = lance_scan._lance_table_factory_function( ds.uri, getattr(ds, "_lance_open_kwargs", None), None, diff --git a/tests/io/lancedb/test_lancedb_reads.py b/tests/io/lance/test_lance_reads.py similarity index 88% rename from tests/io/lancedb/test_lancedb_reads.py rename to tests/io/lance/test_lance_reads.py index ca5cb7b..2e73c5e 100644 --- a/tests/io/lancedb/test_lancedb_reads.py +++ b/tests/io/lance/test_lance_reads.py @@ -18,18 +18,18 @@ def lance_dataset_path(tmp_path_factory): yield str(tmp_dir) -def test_lancedb_read(lance_dataset_path): +def test_lance_read(lance_dataset_path): df = daft.read_lance(lance_dataset_path) assert df.to_pydict() == data -def test_lancedb_read_column_selection(lance_dataset_path): +def test_lance_read_column_selection(lance_dataset_path): df = daft.read_lance(lance_dataset_path) df = df.select("vector") assert df.to_pydict() == {"vector": data["vector"]} -def test_lancedb_read_filter(lance_dataset_path): +def test_lance_read_filter(lance_dataset_path): df = daft.read_lance(lance_dataset_path) df = df.where((df["lat"] > 45) & (df["lat"] < 90)) df = df.select("vector") @@ -68,7 +68,7 @@ def large_lance_dataset_path(tmp_path_factory): (10000, 10), ], ) -def test_lancedb_read_limit_large_dataset(large_lance_dataset_path, limit_size, expected_scan_tasks): +def test_lance_read_limit_large_dataset(large_lance_dataset_path, limit_size, expected_scan_tasks): """Test limit operation on a large Lance dataset with multiple fragments.""" import io @@ -97,12 +97,12 @@ def test_lancedb_read_limit_large_dataset(large_lance_dataset_path, limit_size, assert result["big_int"] == expected_big_ints -def test_lancedb_with_version(lance_dataset_path): +def test_lance_with_version(lance_dataset_path): df = daft.read_lance(uri=lance_dataset_path, version=1) assert df.to_pydict() == data # test pushdown filters with limit and projection - def test_lancedb_read_pushdown(lance_dataset_path, capsys): + def test_lance_read_pushdown(lance_dataset_path, capsys): df = daft.read_lance(lance_dataset_path) df = daft.sql("SELECT vector, lat + 1 as lat_plus_1 FROM df where long < 3 limit 1") df.explain(show_all=True) @@ -139,7 +139,7 @@ def test_lancedb_read_pushdown(lance_dataset_path, capsys): ) -def test_lancedb_read_parallelism_fragment_merging(large_lance_dataset_path): +def test_lance_read_parallelism_fragment_merging(large_lance_dataset_path): """Test parallelism parameter reduces scan tasks by merging fragments.""" df = daft.read_lance(uri=large_lance_dataset_path, fragment_group_size=3) result = df.to_pydict() @@ -147,7 +147,7 @@ def test_lancedb_read_parallelism_fragment_merging(large_lance_dataset_path): assert len(result["big_int"]) == 10000 -def test_lancedb_read_filter_passthrough(tmp_path): +def test_lance_read_filter_passthrough(tmp_path): """Test passing raw SQL filter string to Lance via default_scan_options.""" import lance @@ -164,8 +164,8 @@ def test_lancedb_read_filter_passthrough(tmp_path): assert sorted(res["id"]) == [1, 2] -def test_lancedb_geo_projection_and_filter(tmp_path): - """Test LanceDB read with Geo projection and filter via default_scan_options.""" +def test_lance_geo_projection_and_filter(tmp_path): + """Test Lance read with Geo projection and filter via default_scan_options.""" import lance try: @@ -250,7 +250,7 @@ def test_lancedb_geo_projection_and_filter(tmp_path): assert abs(res["distance"][0]) < 1e-6 -class TestLanceDBCountPushdown: +class TestLanceCountPushdown: tmp_data = { "a": ["a", "b", "c", "d", "e", None], "b": [1, None, 3, None, 5, 6], @@ -271,7 +271,7 @@ def test_count_all_pushdown(self, dataset_path, capsys): df.explain(True) actual = capsys.readouterr() assert "Pushdowns: {projection: [b], aggregation: count(col(b), All)}" in actual.out - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path result = df.to_pydict() assert result == {"count": [6]} @@ -284,7 +284,7 @@ def test_count_column_no_pushdown(self, dataset_path, capsys): df.explain(True) actual = capsys.readouterr() assert "Pushdowns: {projection: [a], aggregation: count(col(a), All)}" not in actual.out - assert "_lancedb_count_result_function" not in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" not in actual.out # Accept daft or daft_lance module path result = df.to_pydict() assert result == {"a": [5]} @@ -297,7 +297,7 @@ def test_count_pushdown_with_select(self, dataset_path, capsys): df.explain(True) actual = capsys.readouterr() assert "Pushdowns: {projection: [b], aggregation: count(col(b), All)}" in actual.out - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path result = df.to_pydict() assert result == {"count": [6]} @@ -310,7 +310,7 @@ def test_count_with_filter_pushdown(self, dataset_path, capsys): _ = capsys.readouterr() df.explain(True) actual = capsys.readouterr() - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path assert "Filter pushdown = is_null(col(b))" in actual.out assert "Aggregation pushdown = count(col(b), All)" in actual.out @@ -325,7 +325,7 @@ def test_count_with_or_filter_pushdown(self, dataset_path, capsys): _ = capsys.readouterr() df.explain(True) actual = capsys.readouterr() - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path assert "Filter pushdown = is_null(col(b)) | is_null(col(c))" in actual.out assert "Aggregation pushdown = count(col(b), All)" in actual.out @@ -341,7 +341,7 @@ def test_count_with_and_filter_pushdown(self, dataset_path, capsys): _ = capsys.readouterr() df.explain(True) actual = capsys.readouterr() - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path assert "Aggregation pushdown" in actual.out assert "Filter pushdown" in actual.out @@ -356,7 +356,7 @@ def test_count_with_filter_and_select_pushdown(self, dataset_path, capsys): _ = capsys.readouterr() df.explain(True) actual = capsys.readouterr() - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path result = df.to_pydict() assert result == {"count": [4]} @@ -373,7 +373,7 @@ def test_edge_case_empty_dataset(self, tmp_path_factory, capsys): df.explain(True) actual = capsys.readouterr() assert "Pushdowns: {projection: [a], aggregation: count(col(a), All)}" in actual.out - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path result = df.to_pydict() assert result == {"count": [0]} @@ -386,14 +386,14 @@ def test_count_1(self, dataset_path, capsys): df.explain(True) actual = capsys.readouterr() assert "Pushdowns: {projection: [b], aggregation: count(col(b), All)}" in actual.out - assert "_lancedb_count_result_function" in actual.out # Accept daft or daft_lance module path + assert "_lance_count_result_function" in actual.out # Accept daft or daft_lance module path result = df.to_pydict() assert result == {"count": [6]} @pytest.mark.parametrize("enable_strict_filter_pushdown", [True, False]) -def test_lancedb_filter_then_limit_behavior(lance_dataset_path, enable_strict_filter_pushdown): +def test_lance_filter_then_limit_behavior(lance_dataset_path, enable_strict_filter_pushdown): """Ensure filter is applied before limit for Lance reads.""" daft.context.set_planning_config(enable_strict_filter_pushdown=enable_strict_filter_pushdown) df = daft.read_lance(lance_dataset_path) @@ -408,7 +408,7 @@ def test_lancedb_filter_then_limit_behavior(lance_dataset_path, enable_strict_fi assert result3 == {"vector": [[0.2, 1.8]], "lat": [40.1], "long": [-74.1], "big_int": [2]} -def test_lancedb_limit_with_filter_and_fragment_grouping_single_task(large_lance_dataset_path): +def test_lance_limit_with_filter_and_fragment_grouping_single_task(large_lance_dataset_path): """Validate filter+limit correctness when fragment grouping is enabled.""" df = daft.read_lance(uri=large_lance_dataset_path, fragment_group_size=4) df = df.filter("big_int = 999").limit(1).select("big_int") diff --git a/tests/io/lancedb/test_lancedb_scalar_index.py b/tests/io/lance/test_lance_scalar_index.py similarity index 99% rename from tests/io/lancedb/test_lancedb_scalar_index.py rename to tests/io/lance/test_lance_scalar_index.py index ffa011f..dfd3327 100644 --- a/tests/io/lancedb/test_lancedb_scalar_index.py +++ b/tests/io/lance/test_lance_scalar_index.py @@ -399,9 +399,9 @@ def test_build_distributed_index_fts_type(self, multi_fragment_lance_dataset): """Test building distributed FTS (Full-Text Search) index.""" dataset_uri = multi_fragment_lance_dataset - # Skip this test if FTS is not supported in the current LanceDB version - # This test will be enabled when LanceDB version supports FTS index type - pytest.skip("FTS index type may not be supported in the current LanceDB version") + # Skip this test if FTS is not supported in the current Lance version + # This test will be enabled when Lance version supports FTS index type + pytest.skip("FTS index type may not be supported in the current Lance version") # Build distributed FTS index create_scalar_index( diff --git a/tests/io/lancedb/test_lancedb_vector_search.py b/tests/io/lance/test_lance_vector_search.py similarity index 100% rename from tests/io/lancedb/test_lancedb_vector_search.py rename to tests/io/lance/test_lance_vector_search.py diff --git a/tests/io/lancedb/test_lancedb_writes.py b/tests/io/lance/test_lance_writes.py similarity index 96% rename from tests/io/lancedb/test_lancedb_writes.py rename to tests/io/lance/test_lance_writes.py index 82c78b1..0b84016 100644 --- a/tests/io/lancedb/test_lancedb_writes.py +++ b/tests/io/lance/test_lance_writes.py @@ -41,7 +41,7 @@ def minio_create_bucket(minio_io_config, bucket_name): yield -def test_lancedb_roundtrip(lance_dataset_path): +def test_lance_roundtrip(lance_dataset_path): df1 = daft.from_pydict(data1) df2 = daft.from_pydict(data2) df1.write_lance(lance_dataset_path, mode="create") @@ -52,7 +52,7 @@ def test_lancedb_roundtrip(lance_dataset_path): @pytest.mark.integration() -def test_lancedb_minio(minio_io_config): +def test_lance_minio(minio_io_config): df1 = daft.from_pydict(data1) df2 = daft.from_pydict(data2) bucket_name = "lance" @@ -64,7 +64,7 @@ def test_lancedb_minio(minio_io_config): assert df_loaded.to_pydict() == df1.concat(df2).to_pydict() -def test_lancedb_write_with_schema(lance_dataset_path): +def test_lance_write_with_schema(lance_dataset_path): """Writing a dataframe to lance with a user-provided schema with lance encodings.""" data = { "vector": [1.1, 1.2], @@ -99,7 +99,7 @@ def test_lancedb_write_with_schema(lance_dataset_path): assert compress_field_metadata[b"lance-encoding:compression"] == b"zstd" -def test_lancedb_write_blob(lance_dataset_path): +def test_lance_write_blob(lance_dataset_path): schema = pa.schema( [ pa.field("blob", pa.large_binary(), metadata={"lance-encoding:blob": "true"}), @@ -130,7 +130,7 @@ def test_lancedb_write_blob(lance_dataset_path): assert f.read() == expected -def test_lancedb_write_string(lance_dataset_path): +def test_lance_write_string(lance_dataset_path): import lance # Make lance dataset with a string column @@ -149,7 +149,7 @@ def test_lancedb_write_string(lance_dataset_path): assert df_loaded.to_pydict() == data -def test_lancedb_write_incompatible_schema(lance_dataset_path): +def test_lance_write_incompatible_schema(lance_dataset_path): import lance # Make lance dataset with an int and a string column @@ -166,7 +166,7 @@ def test_lancedb_write_incompatible_schema(lance_dataset_path): df.write_lance(lance_dataset_path, mode="append") -def test_lancedb_write_with_create_append_mode(lance_dataset_path): +def test_lance_write_with_create_append_mode(lance_dataset_path): import lance # Make lance dataset with a string column diff --git a/tests/io/lancedb/test_mem_wal_writes.py b/tests/io/lance/test_mem_wal_writes.py similarity index 100% rename from tests/io/lancedb/test_mem_wal_writes.py rename to tests/io/lance/test_mem_wal_writes.py