Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion lance_ray/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,10 @@ def create_scalar_index(
if isinstance(index_type, str):
match index_type:
case "INVERTED" | "FTS":
if not pa.types.is_string(value_type):
if not (
pa.types.is_string(value_type)
or pa.types.is_large_string(value_type)
):
raise TypeError(
f"Column {column} must be string type for {index_type} "
f"index, got {value_type}"
Expand Down
36 changes: 36 additions & 0 deletions tests/test_distributed_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,42 @@ def test_build_distributed_fts_index_fts_type(self, multi_fragment_lance_dataset
indices = updated_dataset.list_indices()
assert len(indices) > 0, "No indices found after building"

def test_build_distributed_fts_index_list_large_utf8(self, temp_dir):
    """Check distributed INVERTED index creation on a list<large_utf8> column.

    A four-row table is written to Lance with two rows requested per file,
    an INVERTED index is built on ``tags`` using two workers, and a
    full-text query for a token that occurs in a single row must return
    exactly that row.
    """
    search_term = "needlelarge"

    # Only the second row (id == 2) carries the search token.
    tag_rows = [
        ["alpha", "beta"],
        ["distributed", search_term],
        ["search", "fts"],
        ["other", "tokens"],
    ]
    id_values = pa.array([1, 2, 3, 4], type=pa.int64())
    tag_values = pa.array(tag_rows, type=pa.list_(pa.large_string()))
    source_table = pa.table({"id": id_values, "tags": tag_values})

    ray_dataset = ray.data.from_arrow(source_table)
    lance_path = Path(temp_dir) / "list_large_utf8_text.lance"
    lr.write_lance(
        ray_dataset,
        str(lance_path),
        min_rows_per_file=2,
        max_rows_per_file=2,
    )

    indexed_dataset = lr.create_scalar_index(
        uri=str(lance_path),
        column="tags",
        index_type="INVERTED",
        num_workers=2,
    )

    fts_scanner = indexed_dataset.scanner(
        full_text_query=search_term,
        columns=["id", "tags"],
    )
    matches = fts_scanner.to_table()

    assert matches.num_rows == 1
    assert matches.column("id").to_pylist() == [2]

def test_build_distributed_index_large_dataset(self, temp_dir):
"""Test distributed indexing on a larger dataset with multiple fragments."""
# Generate larger dataset
Expand Down
Loading