Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1461,6 +1461,39 @@ mod tests {
}
}

/// Builds a single-column `Statistics` fixture for the NDV merge tests.
///
/// The result has an exact row count of 10 and one column carrying the given
/// `distinct_count`. Each of `min_value` / `max_value` is stored as an exact
/// `Int64` scalar when `Some`, and as `Precision::Absent` when `None`.
fn make_single_i64_ndv_stats(
    distinct_count: Precision<usize>,
    min_value: Option<i64>,
    max_value: Option<i64>,
) -> Statistics {
    // Translate an optional bound into the precision wrapper used by column stats.
    let bound_precision = |bound: Option<i64>| match bound {
        Some(value) => Precision::Exact(ScalarValue::Int64(Some(value))),
        None => Precision::Absent,
    };

    let column = ColumnStatistics::new_unknown()
        .with_distinct_count(distinct_count)
        .with_min_value(bound_precision(min_value))
        .with_max_value(bound_precision(max_value));

    Statistics::default()
        .with_num_rows(Precision::Exact(10))
        .add_column_statistics(column)
}

/// Merges two single-column statistics and returns the merged distinct count.
///
/// Both inputs are merged with `Statistics::try_merge_iter` against a schema
/// holding one nullable `Int64` field named `"a"`; the distinct count of the
/// first (only) merged column is returned.
///
/// Panics if the merge returns an error or yields no column statistics.
fn merge_single_i64_ndv_distinct_count(
    left: Statistics,
    right: Statistics,
) -> Precision<usize> {
    let field = Field::new("a", DataType::Int64, true);
    let schema = Schema::new(vec![field]);

    let inputs = [&left, &right];
    let merged = Statistics::try_merge_iter(inputs, &schema).unwrap();

    merged.column_statistics[0].distinct_count
}

#[test]
fn test_try_merge() {
// Create a schema with two columns
Expand Down Expand Up @@ -1906,6 +1939,185 @@ mod tests {
);
}

#[test]
fn test_try_merge_ndv_original_union_edge_cases() {
    // Table-driven check of the distinct count produced by merging two
    // single-column statistics, across different min/max bound layouts.

    // One merge input: (distinct count, min bound, max bound).
    type Input = (Precision<usize>, Option<i64>, Option<i64>);
    // One table row: (case name, left input, right input, expected merged count).
    type Row = (&'static str, Input, Input, Precision<usize>);

    // Input whose min and max bounds are both known.
    let bounded =
        |ndv: Precision<usize>, min: i64, max: i64| -> Input { (ndv, Some(min), Some(max)) };
    // Input carrying no min/max bounds at all.
    let unbounded = |ndv: Precision<usize>| -> Input { (ndv, None, None) };

    let table: Vec<Row> = vec![
        // --- both sides carry min/max bounds ---
        (
            "disjoint ranges",
            bounded(Precision::Exact(5), 0, 10),
            bounded(Precision::Exact(3), 20, 30),
            Precision::Inexact(8),
        ),
        (
            "identical ranges",
            bounded(Precision::Exact(10), 0, 100),
            bounded(Precision::Exact(8), 0, 100),
            Precision::Inexact(10),
        ),
        (
            "partial overlap",
            bounded(Precision::Exact(100), 0, 100),
            bounded(Precision::Exact(50), 50, 150),
            Precision::Inexact(125),
        ),
        (
            "right contained in left",
            bounded(Precision::Exact(100), 0, 100),
            bounded(Precision::Exact(50), 25, 75),
            Precision::Inexact(100),
        ),
        // --- at least one side is a single constant value (min == max) ---
        (
            "same constant value",
            bounded(Precision::Exact(1), 5, 5),
            bounded(Precision::Exact(1), 5, 5),
            Precision::Inexact(1),
        ),
        (
            "different constant values",
            bounded(Precision::Exact(1), 5, 5),
            bounded(Precision::Exact(1), 10, 10),
            Precision::Inexact(2),
        ),
        (
            "left constant within right range",
            bounded(Precision::Exact(1), 5, 5),
            bounded(Precision::Exact(10), 0, 10),
            Precision::Inexact(10),
        ),
        (
            "left constant outside right range",
            bounded(Precision::Exact(1), 20, 20),
            bounded(Precision::Exact(10), 0, 10),
            Precision::Inexact(11),
        ),
        (
            "right constant within left range",
            bounded(Precision::Exact(10), 0, 10),
            bounded(Precision::Exact(1), 5, 5),
            Precision::Inexact(10),
        ),
        (
            "right constant outside left range",
            bounded(Precision::Exact(10), 0, 10),
            bounded(Precision::Exact(1), 20, 20),
            Precision::Inexact(11),
        ),
        // --- no bounds on either side ---
        (
            "missing bounds exact plus exact",
            unbounded(Precision::Exact(10)),
            unbounded(Precision::Exact(5)),
            // Shared merge falls back to max without bounds.
            Precision::Inexact(10),
        ),
        (
            "missing bounds exact plus inexact",
            unbounded(Precision::Exact(10)),
            unbounded(Precision::Inexact(5)),
            // Shared merge falls back to max without bounds.
            Precision::Inexact(10),
        ),
        (
            "missing bounds inexact plus inexact",
            unbounded(Precision::Inexact(7)),
            unbounded(Precision::Inexact(3)),
            // Shared merge falls back to max without bounds.
            Precision::Inexact(7),
        ),
        // --- one side has no distinct count: the merged count is expected to be absent ---
        (
            "exact plus absent",
            unbounded(Precision::Exact(10)),
            unbounded(Precision::Absent),
            Precision::Absent,
        ),
        (
            "inexact plus absent",
            unbounded(Precision::Inexact(4)),
            unbounded(Precision::Absent),
            Precision::Absent,
        ),
    ];

    for (name, left, right, expected) in table {
        let (left_ndv, left_min, left_max) = left;
        let (right_ndv, right_min, right_max) = right;

        let left_stats = make_single_i64_ndv_stats(left_ndv, left_min, left_max);
        let right_stats = make_single_i64_ndv_stats(right_ndv, right_min, right_max);
        let actual = merge_single_i64_ndv_distinct_count(left_stats, right_stats);

        assert_eq!(actual, expected, "case {} failed", name);
    }
}

#[test]
fn test_with_fetch_basic_preservation() {
// Test that column statistics and byte size are preserved (as inexact) when applying fetch
Expand Down
Loading
Loading