From 75fd8e2ff42c2f7e4552d42bfe03cf81ee81cefb Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Wed, 17 Jun 2026 22:15:33 +0800 Subject: [PATCH] fix: reject unsupported filter argument in add_columns The filter parameter was accepted and documented but never used, so any filter was silently dropped and every fragment was processed. Raise NotImplementedError instead of silently ignoring it. --- lance_ray/io.py | 13 +++++++++++-- tests/test_basic_read_write.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lance_ray/io.py b/lance_ray/io.py index 270ab287..cc1dd500 100644 --- a/lance_ray/io.py +++ b/lance_ray/io.py @@ -536,9 +536,11 @@ def add_columns( the namespace. transform: The transform to apply to the dataset. It support a lot of types, see `LanceDB API doc https://lancedb.github.io/lance-python-doc/data-evolution.html ` for more details. - filter: The filter to apply to the dataset. It is not supported yet, will be - supported when `get_fragments` support filter see + filter: Not supported yet. Reserved for when `get_fragments` supports a + filter, see `LanceDB API doc `_. + Passing a non-None value raises ``NotImplementedError`` rather than + silently ignoring it. read_columns: The columns from the original dataset to read. reader_schema: The schema to use for the reader. read_version: The version to read. @@ -555,6 +557,13 @@ def add_columns( batch_size: The batch size to use for the reader. concurrency: The number of processes to use for the pool. """ + if filter is not None: + raise NotImplementedError( + "add_columns does not support the 'filter' argument yet; it would be " + "silently ignored and every fragment would still be processed. Omit " + "'filter' until fragment filtering is supported." + ) + validate_uri_or_namespace(uri, namespace_impl, table_id) uri, storage_options = resolve_namespace_table( diff --git a/tests/test_basic_read_write.py b/tests/test_basic_read_write.py index 93c96fe9..2625fb19 100644 --- a/tests/test_basic_read_write.py +++ b/tests/test_basic_read_write.py @@ -355,6 +355,16 @@ def double_score(x: pa.RecordBatch) -> pa.RecordBatch: assert df.columns.tolist() == ["id", "name", "age", "score", "new_column"] assert (df["new_column"] == df["score"] * 2).all() + def test_add_columns_rejects_filter(self, temp_dir): + """A non-None filter must raise instead of being silently ignored.""" + path = Path(temp_dir) / "add_columns_filter.lance" + with pytest.raises(NotImplementedError, match="filter"): + lr.add_columns( + str(path), + transform={"new_column": "score * 2"}, + filter="score > 1", + ) + class TestNamespaceReadWrite: """Test cases for read/write with DirectoryNamespace."""