-
Notifications
You must be signed in to change notification settings - Fork 2k
fix: Use codepoints in lpad, rpad, translate
#21405
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
de35dcf
2919ce7
c63800a
82d8128
488b2bf
7232c34
8a17d52
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,6 @@ use arrow::array::{ | |
| OffsetSizeTrait, StringArrayType, StringViewArray, | ||
| }; | ||
| use arrow::datatypes::DataType; | ||
| use unicode_segmentation::UnicodeSegmentation; | ||
|
|
||
| use crate::utils::{make_scalar_function, utf8_to_str_type}; | ||
| use datafusion_common::cast::as_int64_array; | ||
|
|
@@ -178,7 +177,7 @@ impl ScalarUDFImpl for RPadFunc { | |
| } | ||
| } | ||
|
|
||
| use super::common::{try_as_scalar_i64, try_as_scalar_str}; | ||
| use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str}; | ||
|
|
||
| /// Optimized rpad for constant target_len and fill arguments. | ||
| fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( | ||
|
|
@@ -271,22 +270,19 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( | |
| let data_capacity = string_array.len().saturating_mul(target_len * 4); | ||
| let mut builder = | ||
| GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity); | ||
| let mut graphemes_buf = Vec::new(); | ||
|
|
||
| for maybe_string in string_array.iter() { | ||
| match maybe_string { | ||
| Some(string) => { | ||
| graphemes_buf.clear(); | ||
| graphemes_buf.extend(string.graphemes(true)); | ||
| let char_count = string.chars().count(); | ||
|
|
||
| if target_len < graphemes_buf.len() { | ||
| let end: usize = | ||
| graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); | ||
| builder.append_value(&string[..end]); | ||
| if target_len < char_count { | ||
| builder | ||
| .append_value(&string[..byte_offset_of_char(string, target_len)]); | ||
neilconway marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } else if fill_chars.is_empty() { | ||
| builder.append_value(string); | ||
| } else { | ||
| let pad_chars = target_len - graphemes_buf.len(); | ||
| let pad_chars = target_len - char_count; | ||
| let pad_bytes = char_byte_offsets[pad_chars]; | ||
| builder.write_str(string)?; | ||
| builder.write_str(&padding_buf[..pad_bytes])?; | ||
|
|
@@ -377,7 +373,6 @@ where | |
| { | ||
| let array = if let Some(fill_array) = fill_array { | ||
| let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new(); | ||
| let mut graphemes_buf = Vec::new(); | ||
| let mut fill_chars_buf = Vec::new(); | ||
|
|
||
| for ((string, target_len), fill) in string_array | ||
|
|
@@ -406,8 +401,7 @@ where | |
| } | ||
|
|
||
| if string.is_ascii() && fill.is_ascii() { | ||
| // ASCII fast path: byte length == character length, | ||
| // so we skip expensive grapheme segmentation. | ||
| // ASCII fast path: byte length == character length. | ||
| let str_len = string.len(); | ||
| if target_len < str_len { | ||
| builder.append_value(&string[..target_len]); | ||
|
|
@@ -428,21 +422,20 @@ where | |
| builder.append_value(""); | ||
| } | ||
| } else { | ||
| graphemes_buf.clear(); | ||
| graphemes_buf.extend(string.graphemes(true)); | ||
| let char_count = string.chars().count(); | ||
|
|
||
| fill_chars_buf.clear(); | ||
| fill_chars_buf.extend(fill.chars()); | ||
|
|
||
| if target_len < graphemes_buf.len() { | ||
| let end: usize = | ||
| graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); | ||
| builder.append_value(&string[..end]); | ||
| if target_len < char_count { | ||
| builder.append_value( | ||
| &string[..byte_offset_of_char(string, target_len)], | ||
| ); | ||
| } else if fill_chars_buf.is_empty() { | ||
| builder.append_value(string); | ||
| } else { | ||
| builder.write_str(string)?; | ||
| for l in 0..target_len - graphemes_buf.len() { | ||
| for l in 0..target_len - char_count { | ||
| let c = | ||
| *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); | ||
| builder.write_char(c)?; | ||
|
|
@@ -458,7 +451,6 @@ where | |
| builder.finish() | ||
| } else { | ||
| let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new(); | ||
| let mut graphemes_buf = Vec::new(); | ||
|
|
||
| for (string, target_len) in string_array.iter().zip(length_array.iter()) { | ||
| if let (Some(string), Some(target_len)) = (string, target_len) { | ||
|
|
@@ -492,16 +484,15 @@ where | |
| builder.append_value(""); | ||
| } | ||
| } else { | ||
| graphemes_buf.clear(); | ||
| graphemes_buf.extend(string.graphemes(true)); | ||
| let char_count = string.chars().count(); | ||
|
|
||
| if target_len < graphemes_buf.len() { | ||
| let end: usize = | ||
| graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); | ||
| builder.append_value(&string[..end]); | ||
| if target_len < char_count { | ||
|
||
| builder.append_value( | ||
| &string[..byte_offset_of_char(string, target_len)], | ||
| ); | ||
| } else { | ||
| builder.write_str(string)?; | ||
| for _ in 0..(target_len - graphemes_buf.len()) { | ||
| for _ in 0..(target_len - char_count) { | ||
|
||
| builder.write_str(" ")?; | ||
| } | ||
| builder.append_value(""); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both
byte_offset_of_char(string, ...)andlet char_count = string.chars().count();iterate over the chars instring.It could be optimized to iterate just once with something like:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea! I added a helper here to do this in one place and DRY up the code a bit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I posted updated benchmark results, this is a nice win.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Awesome!