Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions datafusion/functions/benches/split_part.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,23 @@ fn criterion_benchmark(c: &mut Criterion) {
);
}

// Utf8View, very long parts (256 bytes), position 1
{
let strings = gen_string_array(N_ROWS, 5, 256, ".", true);
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into())));
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1)));
bench_split_part(
&mut group,
&split_part_func,
&config_options,
"scalar_utf8view_very_long_parts",
"pos_first",
strings,
delimiter,
position,
);
}

// ── Array delimiter and position ─────────────────

// Utf8, single-char delimiter, array args
Expand Down
125 changes: 117 additions & 8 deletions datafusion/functions/src/string/split_part.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

use crate::utils::utf8_to_str_type;
use arrow::array::{
Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType,
StringLikeArrayBuilder, StringViewBuilder, new_null_array,
Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array,
StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder,
make_view, new_null_array,
};
use arrow::buffer::ScalarBuffer;
use arrow::datatypes::DataType;
use datafusion_common::ScalarValue;
use datafusion_common::cast::as_int64_array;
Expand Down Expand Up @@ -279,12 +281,9 @@ fn split_part_scalar(
}

let result = match string_array.data_type() {
DataType::Utf8View => split_part_scalar_impl(
string_array.as_string_view(),
delimiter,
position,
StringViewBuilder::with_capacity(string_array.len()),
),
DataType::Utf8View => {
split_part_scalar_view(string_array.as_string_view(), delimiter, position)
}
DataType::Utf8 => {
let arr = string_array.as_string::<i32>();
// Conservative under-estimate for data capacity: split_part output
Expand Down Expand Up @@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>(
}
}

/// Zero-copy scalar fast path for `StringViewArray` inputs.
///
/// Rather than copying the selected substring bytes into a fresh buffer, this
/// builds one `u128` view per row that either is the input view itself, an
/// inline view, or a view pointing back into the input array's data buffers.
/// The output array reuses the input's data buffers and null buffer.
///
/// # Arguments
///
/// * `string_view_array` - the input strings.
/// * `delimiter` - the scalar delimiter. An empty delimiter treats each input
///   as a single field (PostgreSQL semantics).
/// * `position` - the scalar 1-based field index; negative values count from
///   the end.
///
/// # Errors
///
/// Returns an error when `position` cannot be converted to a `usize` index.
fn split_part_scalar_view(
    string_view_array: &StringViewArray,
    delimiter: &str,
    position: i64,
) -> Result<ArrayRef> {
    let len = string_view_array.len();
    let views = string_view_array.views();
    let mut views_buf: Vec<u128> = Vec::with_capacity(len);

    if delimiter.is_empty() {
        // PostgreSQL: empty delimiter treats input as a single field, so only
        // the first (or last) field is the input itself; every other field is
        // the empty string.
        let null_view = 0u128;
        let empty_view = make_view(b"", 0, 0);
        let return_input = position == 1 || position == -1;
        views_buf.extend(views.iter().enumerate().map(|(i, &view)| {
            if string_view_array.is_null(i) {
                null_view
            } else if return_input {
                view
            } else {
                empty_view
            }
        }));
    } else if position > 0 {
        let idx: usize = (position - 1).try_into().map_err(|_| {
            exec_datafusion_err!(
                "split_part index {position} exceeds maximum supported value"
            )
        })?;
        let finder = memmem::Finder::new(delimiter.as_bytes());
        split_view_loop(string_view_array, views, &mut views_buf, |s| {
            split_nth_finder(s, &finder, delimiter.len(), idx)
        });
    } else {
        // NOTE(review): per the review discussion, the caller guarantees that
        // `position` is never zero, so `unsigned_abs() - 1` cannot underflow
        // here. Confirm against the caller if this function gains new callers.
        let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| {
            exec_datafusion_err!(
                "split_part index {position} exceeds minimum supported value"
            )
        })?;
        let finder_rev = memmem::FinderRev::new(delimiter.as_bytes());
        split_view_loop(string_view_array, views, &mut views_buf, |s| {
            rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx)
        });
    }

    let views_buf = ScalarBuffer::from(views_buf);

    // Nulls pass through unchanged, so we can use the input's null array.
    let nulls = string_view_array.nulls().cloned();

    // SAFETY: each view is either copied unchanged from the input, or built
    // by `substr_view` from a substring that is a contiguous sub-range of the
    // original string value stored in the input's data buffers.
    unsafe {
        Ok(Arc::new(StringViewArray::new_unchecked(
            views_buf,
            string_view_array.data_buffers().to_vec(),
            nulls,
        )) as ArrayRef)
    }
}

/// Creates a `StringView` referencing a substring of an existing view's buffer.
/// For substrings ≤ 12 bytes, creates an inline view instead.
#[inline]
fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 {
if substr.len() > 12 {
let view = ByteView::from(*original_view);
make_view(
substr.as_bytes(),
view.buffer_index,
view.offset + start_offset,
)
} else {
make_view(substr.as_bytes(), 0, 0)
}
}

/// Runs `split_fn` over every row of `string_view_array` and appends exactly
/// one view per row to `views_buf`.
///
/// * Null rows append a zero view.
/// * Rows where `split_fn` returns `None` append an empty inline view.
/// * Otherwise the returned substring is turned into a view by `substr_view`,
///   using the row's raw view from `views`.
#[inline(always)]
fn split_view_loop<F>(
    string_view_array: &StringViewArray,
    views: &[u128],
    views_buf: &mut Vec<u128>,
    split_fn: F,
) where
    F: Fn(&str) -> Option<&str>,
{
    let empty_view = make_view(b"", 0, 0);
    for (row, raw_view) in views.iter().enumerate() {
        let out_view = if string_view_array.is_null(row) {
            0
        } else {
            let value = string_view_array.value(row);
            split_fn(value).map_or(empty_view, |part| {
                // Assumes `split_fn` returns a sub-slice of `value`, so the
                // pointer difference is the byte offset of `part` within it.
                let start_offset = part.as_ptr() as usize - value.as_ptr() as usize;
                substr_view(raw_view, part, start_offset as u32)
            })
        };
        views_buf.push(out_view);
    }
}

fn split_part_impl<'a, StringArrType, DelimiterArrType, B>(
string_array: &StringArrType,
delimiter_array: &DelimiterArrType,
Expand Down
65 changes: 65 additions & 0 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,71 @@ SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2));
----
Utf8View

# SPLIT_PART with a Utf8View column and scalar delimiter/position (exercises the StringViewArray fast path)
query T
SELECT split_part(column1_utf8view, 'ph', 1) FROM test;
----
Andrew
Xiangpeng
Ra
(empty)
NULL

query T
SELECT split_part(column1_utf8view, 'ph', 2) FROM test;
----
(empty)
(empty)
ael
(empty)
NULL

# Negative position
query T
SELECT split_part(column1_utf8view, 'ph', -1) FROM test;
----
Andrew
Xiangpeng
ael
(empty)
NULL

# Delimiter not found returns full string
query T
SELECT split_part(column1_utf8view, 'ZZZ', 1) FROM test;
----
Andrew
Xiangpeng
Raphael
(empty)
NULL

# Empty delimiter with column
query T
SELECT split_part(column1_utf8view, '', 1) FROM test;
----
Andrew
Xiangpeng
Raphael
(empty)
NULL

# Single-char delimiter with column
query T
SELECT split_part(column1_utf8view, 'a', 1) FROM test;
----
Andrew
Xi
R
(empty)
NULL

# Verify array path also returns Utf8View
query T
SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1;
----
Utf8View

## Ensure no casts for STRPOS
query TT
EXPLAIN SELECT
Expand Down
Loading