Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ regex_expressions = ["regex"]
# enable string functions
string_expressions = ["uuid"]
# enable unicode functions
unicode_expressions = ["unicode-segmentation"]
unicode_expressions = []

[lib]
name = "datafusion_functions"
Expand Down Expand Up @@ -87,7 +87,6 @@ num-traits = { workspace = true }
rand = { workspace = true }
regex = { workspace = true, optional = true }
sha2 = { workspace = true, optional = true }
unicode-segmentation = { version = "^1.13.2", optional = true }
uuid = { workspace = true, features = ["v4"], optional = true }

[dev-dependencies]
Expand Down
18 changes: 13 additions & 5 deletions datafusion/functions/src/unicode/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,16 @@ impl LeftRightSlicer for RightSlicer {
}
}

/// Returns the byte offset of the `n`th codepoint in `string`, or
/// `string.len()` if the string has fewer than `n` codepoints.
#[inline]
pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
string
.char_indices()
.nth(n)
.map_or(string.len(), |(i, _)| i)
}

/// Calculate the byte length of the substring of `n` chars from string `string`
#[inline]
fn left_right_byte_length(string: &str, n: i64) -> usize {
Expand All @@ -88,11 +98,9 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
.map(|(index, _)| index)
.unwrap_or(0),
Ordering::Equal => 0,
Ordering::Greater => string
.char_indices()
.nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
.map(|(index, _)| index)
.unwrap_or(string.len()),
Ordering::Greater => {
byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize)
}
}
}

Expand Down
51 changes: 20 additions & 31 deletions datafusion/functions/src/unicode/lpad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ use arrow::array::{
OffsetSizeTrait, StringArrayType, StringViewArray,
};
use arrow::datatypes::DataType;
use unicode_segmentation::UnicodeSegmentation;

use crate::utils::{make_scalar_function, utf8_to_str_type};
use datafusion_common::cast::as_int64_array;
Expand Down Expand Up @@ -178,7 +177,7 @@ impl ScalarUDFImpl for LPadFunc {
}
}

use super::common::{try_as_scalar_i64, try_as_scalar_str};
use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};

/// Optimized lpad for constant target_len and fill arguments.
fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
Expand Down Expand Up @@ -270,22 +269,19 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
let data_capacity = string_array.len().saturating_mul(target_len * 4);
let mut builder =
GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
let mut graphemes_buf = Vec::new();

for maybe_string in string_array.iter() {
match maybe_string {
Some(string) => {
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));
let char_count = string.chars().count();

if target_len < graphemes_buf.len() {
let end: usize =
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
builder.append_value(&string[..end]);
if target_len < char_count {
builder
.append_value(&string[..byte_offset_of_char(string, target_len)]);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both byte_offset_of_char(string, ...) and let char_count = string.chars().count(); iterate over the chars in string.
It could be optimized to iterate just once with something like:

let mut char_count = 0;
let mut truncate_offset = None;
for (i, (byte_idx, _)) in string.char_indices().enumerate() {
    if i == target_len {
        truncate_offset = Some(byte_idx);
    }
    char_count += 1;
}
if let Some(offset) = truncate_offset {
    builder.append_value(&string[..offset]);
    ... 

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea! I added a helper here to do this in one place and DRY up the code a bit.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I posted updated benchmark results, this is a nice win.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome!

} else if fill_chars.is_empty() {
builder.append_value(string);
} else {
let pad_chars = target_len - graphemes_buf.len();
let pad_chars = target_len - char_count;
let pad_bytes = char_byte_offsets[pad_chars];
builder.write_str(&padding_buf[..pad_bytes])?;
builder.append_value(string);
Expand Down Expand Up @@ -378,7 +374,6 @@ where
{
let array = if let Some(fill_array) = fill_array {
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
let mut graphemes_buf = Vec::new();
let mut fill_chars_buf = Vec::new();

for ((string, target_len), fill) in string_array
Expand Down Expand Up @@ -407,8 +402,7 @@ where
}

if string.is_ascii() && fill.is_ascii() {
// ASCII fast path: byte length == character length,
// so we skip expensive grapheme segmentation.
// ASCII fast path: byte length == character length.
let str_len = string.len();
if target_len < str_len {
builder.append_value(&string[..target_len]);
Expand All @@ -428,21 +422,19 @@ where
builder.append_value(string);
}
} else {
// Reuse buffers by clearing and refilling
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));
let char_count = string.chars().count();

fill_chars_buf.clear();
fill_chars_buf.extend(fill.chars());

if target_len < graphemes_buf.len() {
let end: usize =
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
builder.append_value(&string[..end]);
if target_len < char_count {
builder.append_value(
&string[..byte_offset_of_char(string, target_len)],
);
} else if fill_chars_buf.is_empty() {
builder.append_value(string);
} else {
for l in 0..target_len - graphemes_buf.len() {
for l in 0..target_len - char_count {
let c =
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
builder.write_char(c)?;
Expand All @@ -458,7 +450,6 @@ where
builder.finish()
} else {
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
let mut graphemes_buf = Vec::new();

for (string, target_len) in string_array.iter().zip(length_array.iter()) {
if let (Some(string), Some(target_len)) = (string, target_len) {
Expand Down Expand Up @@ -491,16 +482,14 @@ where
builder.append_value(string);
}
} else {
// Reuse buffer by clearing and refilling
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));

if target_len < graphemes_buf.len() {
let end: usize =
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
builder.append_value(&string[..end]);
let char_count = string.chars().count();

if target_len < char_count {
builder.append_value(
&string[..byte_offset_of_char(string, target_len)],
);
} else {
for _ in 0..(target_len - graphemes_buf.len()) {
for _ in 0..(target_len - char_count) {
builder.write_str(" ")?;
}
builder.append_value(string);
Expand Down
47 changes: 19 additions & 28 deletions datafusion/functions/src/unicode/rpad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ use arrow::array::{
OffsetSizeTrait, StringArrayType, StringViewArray,
};
use arrow::datatypes::DataType;
use unicode_segmentation::UnicodeSegmentation;

use crate::utils::{make_scalar_function, utf8_to_str_type};
use datafusion_common::cast::as_int64_array;
Expand Down Expand Up @@ -178,7 +177,7 @@ impl ScalarUDFImpl for RPadFunc {
}
}

use super::common::{try_as_scalar_i64, try_as_scalar_str};
use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};

/// Optimized rpad for constant target_len and fill arguments.
fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
Expand Down Expand Up @@ -271,22 +270,19 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
let data_capacity = string_array.len().saturating_mul(target_len * 4);
let mut builder =
GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
let mut graphemes_buf = Vec::new();

for maybe_string in string_array.iter() {
match maybe_string {
Some(string) => {
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));
let char_count = string.chars().count();

if target_len < graphemes_buf.len() {
let end: usize =
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
builder.append_value(&string[..end]);
if target_len < char_count {
builder
.append_value(&string[..byte_offset_of_char(string, target_len)]);
} else if fill_chars.is_empty() {
builder.append_value(string);
} else {
let pad_chars = target_len - graphemes_buf.len();
let pad_chars = target_len - char_count;
let pad_bytes = char_byte_offsets[pad_chars];
builder.write_str(string)?;
builder.write_str(&padding_buf[..pad_bytes])?;
Expand Down Expand Up @@ -377,7 +373,6 @@ where
{
let array = if let Some(fill_array) = fill_array {
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
let mut graphemes_buf = Vec::new();
let mut fill_chars_buf = Vec::new();

for ((string, target_len), fill) in string_array
Expand Down Expand Up @@ -406,8 +401,7 @@ where
}

if string.is_ascii() && fill.is_ascii() {
// ASCII fast path: byte length == character length,
// so we skip expensive grapheme segmentation.
// ASCII fast path: byte length == character length.
let str_len = string.len();
if target_len < str_len {
builder.append_value(&string[..target_len]);
Expand All @@ -428,21 +422,20 @@ where
builder.append_value("");
}
} else {
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));
let char_count = string.chars().count();

fill_chars_buf.clear();
fill_chars_buf.extend(fill.chars());

if target_len < graphemes_buf.len() {
let end: usize =
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
builder.append_value(&string[..end]);
if target_len < char_count {
builder.append_value(
&string[..byte_offset_of_char(string, target_len)],
);
} else if fill_chars_buf.is_empty() {
builder.append_value(string);
} else {
builder.write_str(string)?;
for l in 0..target_len - graphemes_buf.len() {
for l in 0..target_len - char_count {
let c =
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
builder.write_char(c)?;
Expand All @@ -458,7 +451,6 @@ where
builder.finish()
} else {
let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
let mut graphemes_buf = Vec::new();

for (string, target_len) in string_array.iter().zip(length_array.iter()) {
if let (Some(string), Some(target_len)) = (string, target_len) {
Expand Down Expand Up @@ -492,16 +484,15 @@ where
builder.append_value("");
}
} else {
graphemes_buf.clear();
graphemes_buf.extend(string.graphemes(true));
let char_count = string.chars().count();

if target_len < graphemes_buf.len() {
let end: usize =
graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
builder.append_value(&string[..end]);
if target_len < char_count {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would <= not be correct here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I refactored the code here as part of some improvements @martin-g suggested, so I think this is moot now.

builder.append_value(
&string[..byte_offset_of_char(string, target_len)],
);
} else {
builder.write_str(string)?;
for _ in 0..(target_len - graphemes_buf.len()) {
for _ in 0..(target_len - char_count) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

couldn't this use some repeat extend or something?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could, although this is just what the pre-existing code did. I think repeat would allocate and write_str in a loop will not; not sure offhand which is better but I'd think it's a wash? We could look at this in a followup PR if you'd like?

builder.write_str(" ")?;
}
builder.append_value("");
Expand Down
Loading
Loading