From 5dd9597709f2223d04074bd4de6c8f4a3629e04c Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 6 Apr 2026 13:15:00 -0400 Subject: [PATCH 1/2] . --- datafusion/functions/benches/split_part.rs | 17 +++ datafusion/functions/src/string/split_part.rs | 125 ++++++++++++++++-- .../test_files/string/string_view.slt | 65 +++++++++ 3 files changed, 199 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/benches/split_part.rs b/datafusion/functions/benches/split_part.rs index 0f4998effc2ac..d578339368768 100644 --- a/datafusion/functions/benches/split_part.rs +++ b/datafusion/functions/benches/split_part.rs @@ -210,6 +210,23 @@ fn criterion_benchmark(c: &mut Criterion) { ); } + // Utf8View, very long parts (256 bytes), position 1 + { + let strings = gen_string_array(N_ROWS, 5, 256, ".", true); + let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into()))); + let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1))); + bench_split_part( + &mut group, + &split_part_func, + &config_options, + "scalar_utf8view_very_long_parts", + "pos_first", + strings, + delimiter, + position, + ); + } + // ── Array delimiter and position ───────────────── // Utf8, single-char delimiter, array args diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 972a10c26474e..12ccdfc429a3c 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -17,9 +17,11 @@ use crate::utils::utf8_to_str_type; use arrow::array::{ - Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType, - StringLikeArrayBuilder, StringViewBuilder, new_null_array, + Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array, + StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder, + make_view, new_null_array, }; +use arrow::buffer::ScalarBuffer; use arrow::datatypes::DataType; use datafusion_common::ScalarValue; use datafusion_common::cast::as_int64_array; @@ -279,12 +281,9 @@ fn split_part_scalar( } let result = match string_array.data_type() { - DataType::Utf8View => split_part_scalar_impl( - string_array.as_string_view(), - delimiter, - position, - StringViewBuilder::with_capacity(string_array.len()), - ), + DataType::Utf8View => { + split_part_scalar_view(string_array.as_string_view(), delimiter, position) + } DataType::Utf8 => { let arr = string_array.as_string::(); // Conservative under-estimate for data capacity: split_part output @@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>( } } +/// Zero-copy scalar fast path for `StringViewArray` inputs. +/// +/// Instead of copying substring bytes into a new buffer, constructs +/// `StringView` entries that point back into the original array's data +/// buffers. +fn split_part_scalar_view( + string_view_array: &StringViewArray, + delimiter: &str, + position: i64, +) -> Result { + let len = string_view_array.len(); + let mut views_buf = Vec::with_capacity(len); + let views = string_view_array.views(); + + if delimiter.is_empty() { + // PostgreSQL: empty delimiter treats input as a single field. + let empty_view = make_view(b"", 0, 0); + let return_input = position == 1 || position == -1; + for i in 0..len { + if string_view_array.is_null(i) { + views_buf.push(0); + } else if return_input { + views_buf.push(views[i]); + } else { + views_buf.push(empty_view); + } + } + } else if position > 0 { + let idx: usize = (position - 1).try_into().map_err(|_| { + exec_datafusion_err!( + "split_part index {position} exceeds maximum supported value" + ) + })?; + let finder = memmem::Finder::new(delimiter.as_bytes()); + split_view_loop(string_view_array, views, &mut views_buf, |s| { + split_nth_finder(s, &finder, delimiter.len(), idx) + }); + } else { + let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| { + exec_datafusion_err!( + "split_part index {position} exceeds minimum supported value" + ) + })?; + let finder_rev = memmem::FinderRev::new(delimiter.as_bytes()); + split_view_loop(string_view_array, views, &mut views_buf, |s| { + rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx) + }); + } + + let views_buf = ScalarBuffer::from(views_buf); + + // Nulls pass through unchanged, so we can use the input's null array. + let nulls = string_view_array.nulls().cloned(); + + // Safety: each view is either copied unchanged from the input, or built + // by `substr_view` from a substring that is a contiguous sub-range of the + // original string value stored in the input's data buffers. + unsafe { + Ok(Arc::new(StringViewArray::new_unchecked( + views_buf, + string_view_array.data_buffers().to_vec(), + nulls, + )) as ArrayRef) + } +} + +/// Creates a `StringView` referencing a substring of an existing view's buffer. +/// For substrings ≤ 12 bytes, creates an inline view instead. +#[inline] +fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 { + if substr.len() > 12 { + let view = ByteView::from(*original_view); + make_view( + substr.as_bytes(), + view.buffer_index, + view.offset + start_offset, + ) + } else { + make_view(substr.as_bytes(), 0, 0) + } +} + +/// Applies `split_fn` to each non-null string and appends the resulting view to +/// `views_buf`. +#[inline(always)] +fn split_view_loop( + string_view_array: &StringViewArray, + views: &[u128], + views_buf: &mut Vec, + split_fn: F, +) where + F: Fn(&str) -> Option<&str>, +{ + let empty_view = make_view(b"", 0, 0); + for (i, raw_view) in views.iter().enumerate() { + if string_view_array.is_null(i) { + views_buf.push(0); + continue; + } + let string = string_view_array.value(i); + match split_fn(string) { + Some(substr) => { + let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize; + views_buf.push(substr_view(raw_view, substr, start_offset as u32)); + } + None => views_buf.push(empty_view), + } + } +} + fn split_part_impl<'a, StringArrType, DelimiterArrType, B>( string_array: &StringArrType, delimiter_array: &DelimiterArrType, diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 086f37d6c3354..5afec8a41c9e7 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -954,6 +954,71 @@ SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2)); ---- Utf8View +# SPLIT_PART with Utf8View column (exercises the array fast path) +query T +SELECT split_part(column1_utf8view, 'ph', 1) FROM test; +---- +Andrew +Xiangpeng +Ra +(empty) +NULL + +query T +SELECT split_part(column1_utf8view, 'ph', 2) FROM test; +---- +(empty) +(empty) +ael +(empty) +NULL + +# Negative position +query T +SELECT split_part(column1_utf8view, 'ph', -1) FROM test; +---- +Andrew +Xiangpeng +ael +(empty) +NULL + +# Delimiter not found returns full string +query T +SELECT split_part(column1_utf8view, 'ZZZ', 1) FROM test; +---- +Andrew +Xiangpeng +Raphael +(empty) +NULL + +# Empty delimiter with column +query T +SELECT split_part(column1_utf8view, '', 1) FROM test; +---- +Andrew +Xiangpeng +Raphael +(empty) +NULL + +# Single-char delimiter with column +query T +SELECT split_part(column1_utf8view, 'a', 1) FROM test; +---- +Andrew +Xi +R +(empty) +NULL + +# Verify array path also returns Utf8View +query T +SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1; +---- +Utf8View + ## Ensure no casts for STRPOS query TT EXPLAIN SELECT From 2e5d10ecc119f88d91df7523119bad4e30654076 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 7 Apr 2026 14:08:20 -0400 Subject: [PATCH 2/2] Add more tests, per review --- datafusion/functions/src/string/split_part.rs | 29 ++++++++++++++++++- .../test_files/string/string_view.slt | 26 +++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 12ccdfc429a3c..1994c65bcf326 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -599,7 +599,7 @@ where #[cfg(test)] mod tests { - use arrow::array::{Array, StringArray}; + use arrow::array::{Array, AsArray, StringArray, StringViewArray}; use arrow::datatypes::DataType::Utf8; use datafusion_common::ScalarValue; @@ -795,4 +795,31 @@ mod tests { Ok(()) } + + #[test] + fn test_split_part_stringview_sliced() -> Result<()> { + use super::split_part_scalar_view; + + let strings: StringViewArray = vec![ + Some("skip_this.value"), + Some("this_is_a_long_prefix.suffix"), + Some("short.val"), + Some("another_long_result.rest"), + None, + ] + .into_iter() + .collect(); + + // Slice off the first element to get a non-zero offset array. + let sliced = strings.slice(1, 4); + let result = split_part_scalar_view(&sliced, ".", 1)?; + let result = result.as_string_view(); + assert_eq!(result.len(), 4); + assert_eq!(result.value(0), "this_is_a_long_prefix"); + assert_eq!(result.value(1), "short"); + assert_eq!(result.value(2), "another_long_result"); + assert!(result.is_null(3)); + + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 5afec8a41c9e7..126c4bcafb533 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -1019,6 +1019,32 @@ SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1; ---- Utf8View +# Long strings (>12 bytes) exercise out-of-line StringView construction in split_part +query T +SELECT split_part(arrow_cast(column1, 'Utf8View'), '.', 1) FROM + (VALUES ('this_is_over_12.suffix'), ('short.val'), (NULL)) AS t(column1); +---- +this_is_over_12 +short +NULL + +query T +SELECT split_part(arrow_cast(column1, 'Utf8View'), '.', -1) FROM + (VALUES ('prefix.this_is_over_12'), ('a.short'), (NULL)) AS t(column1); +---- +this_is_over_12 +short +NULL + +# Results at the 12-byte inline/out-of-line boundary +query T +SELECT split_part(arrow_cast(column1, 'Utf8View'), '.', 1) FROM + (VALUES ('exactly12byt.rest'), ('thirteen_byte.rest'), ('twelve_bytes.rest')) AS t(column1); +---- +exactly12byt +thirteen_byte +twelve_bytes + ## Ensure no casts for STRPOS query TT EXPLAIN SELECT