From d758cd1542ea03a7408cec53502dc3a947eeae4a Mon Sep 17 00:00:00 2001 From: Daniel Mao Date: Fri, 12 Jun 2026 17:05:42 +0800 Subject: [PATCH] fix: return error instead of panicking on zero-dimension fixed-size-list columns A fixed-size-list column with dimension 0 used to panic with 'attempt to divide by zero' (rust/lance-encoding/src/data.rs) on every write path and when reading datasets persisted by older writers. Reject such columns with a clean schema error, following the maintainer guidance in #5102 (error, not panic). A shared helper `validate_fixed_size_list_dimensions` (in lance-core) is used on both paths: - Write path: called from Schema::validate(), which runs inside Schema::try_from(&ArrowSchema), so all write entry points are covered. - Read path: called from the decoder field-scheduler builders, so datasets persisted by old writers fail cleanly instead of panicking at scheduling time. A FixedSizeList of a FixedSizeList over a primitive collapses into a single leaf field, so the helper recurses through nested list types to also reject an inner zero dimension, e.g. FixedSizeList(FixedSizeList(Float32, 0), 4). Closes #5102 --- python/python/tests/test_dataset.py | 19 ++++++ rust/lance-core/src/datatypes.rs | 1 + rust/lance-core/src/datatypes/schema.rs | 90 ++++++++++++++++++++++++- rust/lance-encoding/src/decoder.rs | 52 +++++++++++++- 4 files changed, 160 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 39dac98aec6..45866f3c4da 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -93,6 +93,25 @@ def test_roundtrip_types(tmp_path: Path): assert dataset.to_table() == table +@pytest.mark.parametrize("data_storage_version", ["legacy", "stable", "2.1"]) +def test_write_zero_dimension_fixed_size_list( + tmp_path: Path, data_storage_version: str +): + # Zero-dimension fixed-size lists must be rejected with a clean error + # instead of a divide-by-zero panic (#5102) + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("vec", pa.list_(pa.float32(), 0)), + ] + ) + table = pa.table({"id": [1], "vec": [[]]}, schema=schema) + with pytest.raises(OSError, match="dimension must be a positive integer"): + lance.write_dataset( + table, tmp_path / "ds.lance", data_storage_version=data_storage_version + ) + + def test_dataset_overwrite(tmp_path: Path): table1 = pa.Table.from_pylist([{"a": 1, "b": 2}, {"a": 10, "b": 20}]) base_dir = tmp_path / "test" diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 628f9cf9a90..8837037c308 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -25,6 +25,7 @@ pub use field::{ pub use schema::{ BlobHandling, FieldRef, OnMissing, Projectable, Projection, Schema, escape_field_path_for_project, format_field_path, parse_field_path, + validate_fixed_size_list_dimensions, }; pub static BLOB_DESC_FIELDS: LazyLock = LazyLock::new(|| { diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index f959c37672f..d13eb476359 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -11,7 +11,7 @@ use std::{ use crate::deepsize::DeepSizeOf; use arrow_array::RecordBatch; -use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; +use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_arrow::*; use super::field::{Field, OnTypeMismatch, SchemaCompareOptions}; @@ -110,6 +110,29 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } } +/// Reject `FixedSizeList` types whose dimension is not a positive integer. +/// +/// The row count of a fixed-size list is derived by dividing the number of +/// child items by the dimension, so a zero dimension panics with a +/// divide-by-zero further down the write path (see issue #5102). A +/// `FixedSizeList` of a `FixedSizeList` over a primitive collapses into a +/// single leaf field, so the pre-order field walk never visits the inner list; +/// recurse through the nested list types here to catch an inner zero dimension. +/// +/// Shared by [`Schema::validate`] on the write path and the decoder's +/// field-scheduler builders on the read path. +pub fn validate_fixed_size_list_dimensions(field_name: &str, data_type: &DataType) -> Result<()> { + if let DataType::FixedSizeList(inner, dimension) = data_type { + if *dimension <= 0 { + return Err(Error::schema(format!( + "Field \"{field_name}\" contains a FixedSizeList with dimension {dimension}; dimension must be a positive integer" + ))); + } + validate_fixed_size_list_dimensions(field_name, inner.data_type())?; + } + Ok(()) +} + impl Schema { /// The unenforced primary key fields in the schema, ordered by position. /// @@ -346,6 +369,10 @@ impl Schema { field.id, self ))); } + // The row count of a fixed-size list is derived by dividing the + // number of items by the dimension, so a zero dimension would + // panic with a divide-by-zero further down the write path. + validate_fixed_size_list_dimensions(&field.name, &field.data_type())?; } Ok(()) @@ -2825,6 +2852,67 @@ mod tests { assert!(paths.contains(&"name".to_string())); } + #[test] + fn test_validate_rejects_zero_dimension_fixed_size_list() { + // A zero dimension divides-by-zero further down the write path (#5102) + let fsl = |dimension: i32| { + ArrowDataType::FixedSizeList( + Arc::new(ArrowField::new("item", ArrowDataType::Float32, true)), + dimension, + ) + }; + + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(0), true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // Nested inside a struct is rejected too + let arrow_schema = ArrowSchema::new(vec![ArrowField::new( + "outer", + ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new( + "vec", + fsl(0), + true, + )])), + true, + )]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A zero-dimension FixedSizeList nested inside a positive-dimension + // FixedSizeList collapses into a single leaf field, so the inner + // dimension is not visited by the pre-order field walk and must still + // be rejected: FixedSizeList(FixedSizeList(Float32, 0), 4). + let nested = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(0), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested, true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A positive dimension still validates, including nested lists + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(2), true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + let nested_ok = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(2), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested_ok, true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + } + #[test] fn test_schema_unenforced_clustering_key() { use crate::datatypes::field::LANCE_UNENFORCED_CLUSTERING_KEY_POSITION; diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs index 59886d337d1..a30d5ed93a9 100644 --- a/rust/lance-encoding/src/decoder.rs +++ b/rust/lance-encoding/src/decoder.rs @@ -226,7 +226,9 @@ use futures::stream::{self, BoxStream}; use futures::{FutureExt, StreamExt}; use lance_arrow::DataTypeExt; use lance_core::cache::LanceCache; -use lance_core::datatypes::{BLOB_DESC_LANCE_FIELD, Field, Schema}; +use lance_core::datatypes::{ + BLOB_DESC_LANCE_FIELD, Field, Schema, validate_fixed_size_list_dimensions, +}; use lance_core::utils::futures::{FinallyStreamExt, StreamOnDropExt}; use lance_core::utils::parse::parse_env_as_bool; use log::{debug, trace, warn}; @@ -723,6 +725,7 @@ impl CoreFieldDecoderStrategy { column_infos: &mut ColumnInfoIter, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_structural_primitive(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new( @@ -832,6 +835,7 @@ impl CoreFieldDecoderStrategy { buffers: FileBuffers, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_primitive_legacy(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?; @@ -2887,6 +2891,52 @@ pub async fn decode_batch( mod tests { use super::*; + #[test] + fn test_read_zero_dimension_fsl_errors_instead_of_panicking() { + // Simulates reading a column whose stored schema declares a + // zero-dimension FixedSizeList, as old writers (before #5102) could + // persist. The read plan is built by the field-scheduler factories, + // which run the dimension guard before touching any column data, so + // an empty column iterator is sufficient to reach the guard. The read + // must surface a clean error rather than a divide-by-zero panic. + use arrow_schema::Field as ArrowField; + + let zero_dim = DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 0, + ); + let field = Field::try_from(&ArrowField::new("vec", zero_dim, true)).unwrap(); + let strategy = CoreFieldDecoderStrategy::default(); + + let mut structural_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_structural_field_scheduler(&field, &mut structural_columns) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + let mut legacy_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_legacy_field_scheduler( + &field, + &mut legacy_columns, + FileBuffers { + positions_and_sizes: &[], + }, + ) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + } + #[test] fn test_coalesce_indices_to_ranges_with_single_index() { let indices = vec![1];