Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,25 @@ def test_roundtrip_types(tmp_path: Path):
assert dataset.to_table() == table


@pytest.mark.parametrize("data_storage_version", ["legacy", "stable", "2.1"])
def test_write_zero_dimension_fixed_size_list(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we at least add one test case about reading existed zero dimension by the old writer logic?

@DanielMao1 DanielMao1 Jun 15, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added as a Rust test rather than Python: once Schema::validate() rejects zero-dim FSL, no write path can produce such a dataset (and the encoder panics on zero-dim), so the fixture can't be generated by current code.
Instead, test_read_zero_dimension_fsl_errors_instead_of_panicking (in decoder.rs) drives create_structural_field_scheduler / create_legacy_field_scheduler — the read-plan builders the file reader calls against a stored schema — with a zero-dim FSL field, asserting a clean error instead of a panic.

tmp_path: Path, data_storage_version: str
):
# Zero-dimension fixed-size lists must be rejected with a clean error
# instead of a divide-by-zero panic (#5102)
schema = pa.schema(
[
pa.field("id", pa.int64()),
pa.field("vec", pa.list_(pa.float32(), 0)),
]
)
table = pa.table({"id": [1], "vec": [[]]}, schema=schema)
with pytest.raises(OSError, match="dimension must be a positive integer"):
lance.write_dataset(
table, tmp_path / "ds.lance", data_storage_version=data_storage_version
)


def test_dataset_overwrite(tmp_path: Path):
table1 = pa.Table.from_pylist([{"a": 1, "b": 2}, {"a": 10, "b": 20}])
base_dir = tmp_path / "test"
Expand Down
1 change: 1 addition & 0 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub use field::{
pub use schema::{
BlobHandling, FieldRef, OnMissing, Projectable, Projection, Schema,
escape_field_path_for_project, format_field_path, parse_field_path,
validate_fixed_size_list_dimensions,
};

pub static BLOB_DESC_FIELDS: LazyLock<Fields> = LazyLock::new(|| {
Expand Down
90 changes: 89 additions & 1 deletion rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::{

use crate::deepsize::DeepSizeOf;
use arrow_array::RecordBatch;
use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use lance_arrow::*;

use super::field::{Field, OnTypeMismatch, SchemaCompareOptions};
Expand Down Expand Up @@ -110,6 +110,29 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> {
}
}

/// Reject `FixedSizeList` types whose dimension is not a positive integer.
///
/// The row count of a fixed-size list is derived by dividing the number of
/// child items by the dimension, so a zero dimension panics with a
/// divide-by-zero further down the write path (see issue #5102). A
/// `FixedSizeList` of a `FixedSizeList` over a primitive collapses into a
/// single leaf field, so the pre-order field walk never visits the inner list;
/// recurse through the nested list types here to catch an inner zero dimension.
///
/// Shared by [`Schema::validate`] on the write path and the decoder's
/// field-scheduler builders on the read path.
pub fn validate_fixed_size_list_dimensions(field_name: &str, data_type: &DataType) -> Result<()> {
if let DataType::FixedSizeList(inner, dimension) = data_type {
if *dimension <= 0 {
return Err(Error::schema(format!(
"Field \"{field_name}\" contains a FixedSizeList with dimension {dimension}; dimension must be a positive integer"
)));
}
validate_fixed_size_list_dimensions(field_name, inner.data_type())?;
}
Ok(())
}

impl Schema {
/// The unenforced primary key fields in the schema, ordered by position.
///
Expand Down Expand Up @@ -346,6 +369,10 @@ impl Schema {
field.id, self
)));
}
// The row count of a fixed-size list is derived by dividing the
// number of items by the dimension, so a zero dimension would
// panic with a divide-by-zero further down the write path.
validate_fixed_size_list_dimensions(&field.name, &field.data_type())?;
}

Ok(())
Expand Down Expand Up @@ -2825,6 +2852,67 @@ mod tests {
assert!(paths.contains(&"name".to_string()));
}

#[test]
fn test_validate_rejects_zero_dimension_fixed_size_list() {
// A zero dimension divides-by-zero further down the write path (#5102)
let fsl = |dimension: i32| {
ArrowDataType::FixedSizeList(
Arc::new(ArrowField::new("item", ArrowDataType::Float32, true)),
dimension,
)
};

let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(0), true)]);
let err = Schema::try_from(&arrow_schema).unwrap_err();
assert!(
err.to_string()
.contains("dimension must be a positive integer"),
"unexpected error: {}",
err
);

// Nested inside a struct is rejected too
let arrow_schema = ArrowSchema::new(vec![ArrowField::new(
"outer",
ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new(
"vec",
fsl(0),
true,
)])),
true,
)]);
let err = Schema::try_from(&arrow_schema).unwrap_err();
assert!(
err.to_string()
.contains("dimension must be a positive integer"),
"unexpected error: {}",
err
);

// A zero-dimension FixedSizeList nested inside a positive-dimension
// FixedSizeList collapses into a single leaf field, so the inner
// dimension is not visited by the pre-order field walk and must still
// be rejected: FixedSizeList(FixedSizeList(Float32, 0), 4).
let nested =
ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(0), true)), 4);
let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested, true)]);
let err = Schema::try_from(&arrow_schema).unwrap_err();
assert!(
err.to_string()
.contains("dimension must be a positive integer"),
"unexpected error: {}",
err
);

// A positive dimension still validates, including nested lists
let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(2), true)]);
assert!(Schema::try_from(&arrow_schema).is_ok());
let nested_ok =
ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(2), true)), 4);
let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested_ok, true)]);
assert!(Schema::try_from(&arrow_schema).is_ok());
}

#[test]
fn test_schema_unenforced_clustering_key() {
use crate::datatypes::field::LANCE_UNENFORCED_CLUSTERING_KEY_POSITION;
Expand Down
52 changes: 51 additions & 1 deletion rust/lance-encoding/src/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,9 @@ use futures::stream::{self, BoxStream};
use futures::{FutureExt, StreamExt};
use lance_arrow::DataTypeExt;
use lance_core::cache::LanceCache;
use lance_core::datatypes::{BLOB_DESC_LANCE_FIELD, Field, Schema};
use lance_core::datatypes::{
BLOB_DESC_LANCE_FIELD, Field, Schema, validate_fixed_size_list_dimensions,
};
use lance_core::utils::futures::{FinallyStreamExt, StreamOnDropExt};
use lance_core::utils::parse::parse_env_as_bool;
use log::{debug, trace, warn};
Expand Down Expand Up @@ -723,6 +725,7 @@ impl CoreFieldDecoderStrategy {
column_infos: &mut ColumnInfoIter,
) -> Result<Box<dyn StructuralFieldScheduler>> {
let data_type = field.data_type();
validate_fixed_size_list_dimensions(&field.name, &data_type)?;
if Self::is_structural_primitive(&data_type) {
let column_info = column_infos.expect_next()?;
let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new(
Expand Down Expand Up @@ -832,6 +835,7 @@ impl CoreFieldDecoderStrategy {
buffers: FileBuffers,
) -> Result<Box<dyn crate::previous::decoder::FieldScheduler>> {
let data_type = field.data_type();
validate_fixed_size_list_dimensions(&field.name, &data_type)?;
if Self::is_primitive_legacy(&data_type) {
let column_info = column_infos.expect_next()?;
let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?;
Expand Down Expand Up @@ -2887,6 +2891,52 @@ pub async fn decode_batch(
mod tests {
use super::*;

#[test]
fn test_read_zero_dimension_fsl_errors_instead_of_panicking() {
// Simulates reading a column whose stored schema declares a
// zero-dimension FixedSizeList, as old writers (before #5102) could
// persist. The read plan is built by the field-scheduler factories,
// which run the dimension guard before touching any column data, so
// an empty column iterator is sufficient to reach the guard. The read
// must surface a clean error rather than a divide-by-zero panic.
use arrow_schema::Field as ArrowField;

let zero_dim = DataType::FixedSizeList(
Arc::new(ArrowField::new("item", DataType::Float32, true)),
0,
);
let field = Field::try_from(&ArrowField::new("vec", zero_dim, true)).unwrap();
let strategy = CoreFieldDecoderStrategy::default();

let mut structural_columns = ColumnInfoIter::new(vec![], &[]);
let err = strategy
.create_structural_field_scheduler(&field, &mut structural_columns)
.unwrap_err();
assert!(
err.to_string()
.contains("dimension must be a positive integer"),
"unexpected error: {}",
err
);

let mut legacy_columns = ColumnInfoIter::new(vec![], &[]);
let err = strategy
.create_legacy_field_scheduler(
&field,
&mut legacy_columns,
FileBuffers {
positions_and_sizes: &[],
},
)
.unwrap_err();
assert!(
err.to_string()
.contains("dimension must be a positive integer"),
"unexpected error: {}",
err
);
}

#[test]
fn test_coalesce_indices_to_ranges_with_single_index() {
let indices = vec![1];
Expand Down
Loading