From f1ab175f542b4e8a9a74b65085de79e129b622be Mon Sep 17 00:00:00 2001
From: Ddupg
Date: Mon, 8 Jun 2026 19:23:36 +0800
Subject: [PATCH] feat: support list in distributed FTS

Accept large_string value types, in addition to string, in the
INVERTED/FTS type check of create_scalar_index, and add a test that
builds a distributed index on a list<large_string> column and queries
it with a full-text search.

Change-Id: I7932b2f6620d4d10d470bf4d01ad44a817fdbf85
---
NOTE(review): the line structure and indentation of this patch were
reconstructed from a whitespace-collapsed copy. Hunk line counts match
the hunk headers, but leading whitespace on context lines is inferred
from Python syntax -- verify against the target tree (or apply with
`git apply --ignore-whitespace`) before relying on it.

 lance_ray/index.py                 |  5 ++++-
 tests/test_distributed_indexing.py | 36 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/lance_ray/index.py b/lance_ray/index.py
index 76c1e21f..d8bcfc2a 100755
--- a/lance_ray/index.py
+++ b/lance_ray/index.py
@@ -547,7 +547,10 @@ def create_scalar_index(
     if isinstance(index_type, str):
         match index_type:
             case "INVERTED" | "FTS":
-                if not pa.types.is_string(value_type):
+                if not (
+                    pa.types.is_string(value_type)
+                    or pa.types.is_large_string(value_type)
+                ):
                     raise TypeError(
                         f"Column {column} must be string type for {index_type} "
                         f"index, got {value_type}"
diff --git a/tests/test_distributed_indexing.py b/tests/test_distributed_indexing.py
index 9a047d8f..4b6b7500 100755
--- a/tests/test_distributed_indexing.py
+++ b/tests/test_distributed_indexing.py
@@ -320,6 +320,42 @@ def test_build_distributed_fts_index_fts_type(self, multi_fragment_lance_dataset
         indices = updated_dataset.list_indices()
         assert len(indices) > 0, "No indices found after building"
 
+    def test_build_distributed_fts_index_list_large_utf8(self, temp_dir):
+        """Test distributed FTS index building on list columns."""
+        search_term = "needlelarge"
+        table = pa.table(
+            {
+                "id": pa.array([1, 2, 3, 4], type=pa.int64()),
+                "tags": pa.array(
+                    [
+                        ["alpha", "beta"],
+                        ["distributed", search_term],
+                        ["search", "fts"],
+                        ["other", "tokens"],
+                    ],
+                    type=pa.list_(pa.large_string()),
+                ),
+            }
+        )
+        dataset = ray.data.from_arrow(table)
+        path = Path(temp_dir) / "list_large_utf8_text.lance"
+        lr.write_lance(dataset, str(path), min_rows_per_file=2, max_rows_per_file=2)
+
+        updated_dataset = lr.create_scalar_index(
+            uri=str(path),
+            column="tags",
+            index_type="INVERTED",
+            num_workers=2,
+        )
+
+        results = updated_dataset.scanner(
+            full_text_query=search_term,
+            columns=["id", "tags"],
+        ).to_table()
+
+        assert results.num_rows == 1
+        assert results.column("id").to_pylist() == [2]
+
     def test_build_distributed_index_large_dataset(self, temp_dir):
         """Test distributed indexing on a larger dataset with multiple fragments."""
         # Generate larger dataset