-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-49753: [C++][Gandiva] Fix overflow in string functions #49813
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
3b6c9e7
72b7a85
d8beb19
7252535
de0cc36
52f6834
5513ae3
014122f
893cb12
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -213,6 +213,25 @@ int32_t gdv_fn_utf8_char_length(char c) { | |||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| static inline bool is_datalen_valid(int64_t context, int32_t data_len, int32_t* alloc_len, | ||||||
| int32_t* out_len) { | ||||||
| // Reject negative lengths: set an error message on the context, zero *out_len and return false | ||||||
| if (ARROW_PREDICT_FALSE(data_len < 0)) { | ||||||
| gdv_fn_context_set_error_msg(context, "Invalid (negative) data length"); | ||||||
| *out_len = 0; | ||||||
| return false; | ||||||
| } | ||||||
|
|
||||||
| // Check overflow of 2 * data_len; the product is presumably stored in *alloc_len by MultiplyWithOverflow (confirm against its declaration). On overflow set an error message, zero *out_len and return false | ||||||
| if (ARROW_PREDICT_FALSE( | ||||||
| arrow::internal::MultiplyWithOverflow(2, data_len, alloc_len))) { | ||||||
| gdv_fn_context_set_error_msg(context, "Would overflow maximum output size"); | ||||||
| *out_len = 0; | ||||||
| return false; | ||||||
| } | ||||||
| return true; | ||||||
| } | ||||||
|
|
||||||
| // Convert an utf8 string to its corresponding lowercase string | ||||||
| GANDIVA_EXPORT | ||||||
| const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len, | ||||||
|
|
@@ -222,10 +241,16 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le | |||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| int32_t alloc_length = 0; | ||||||
| if (ARROW_PREDICT_FALSE( | ||||||
| not is_datalen_valid(context, data_len, &alloc_length, out_len))) { | ||||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte | ||||||
| // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of | ||||||
| // the output can be at most twice the length of the input | ||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len)); | ||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, alloc_length)); | ||||||
| if (out == nullptr) { | ||||||
| gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); | ||||||
| *out_len = 0; | ||||||
|
|
@@ -294,10 +319,16 @@ const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_le | |||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| int32_t alloc_length = 0; | ||||||
| if (ARROW_PREDICT_FALSE( | ||||||
| not is_datalen_valid(context, data_len, &alloc_length, out_len))) { | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte | ||||||
| // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of | ||||||
| // the output can be at most twice the length of the input | ||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len)); | ||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, alloc_length)); | ||||||
| if (out == nullptr) { | ||||||
| gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); | ||||||
| *out_len = 0; | ||||||
|
|
@@ -367,6 +398,15 @@ const char* gdv_fn_substring_index(int64_t context, const char* txt, int32_t txt | |||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| if (ARROW_PREDICT_FALSE(txt_len < 0)) { | ||||||
| *out_len = 0; | ||||||
| return ""; | ||||||
| } | ||||||
| if (ARROW_PREDICT_FALSE(pat_len < 0)) { | ||||||
| *out_len = 0; | ||||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, txt_len)); | ||||||
| if (out == nullptr) { | ||||||
| gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); | ||||||
|
|
@@ -445,8 +485,8 @@ const char* gdv_fn_substring_index(int64_t context, const char* txt, int32_t txt | |||||
| return out; | ||||||
|
|
||||||
| } else { | ||||||
| memcpy(out, txt, static_cast<size_t>(txt_len)); | ||||||
| *out_len = txt_len; | ||||||
| memcpy(out, txt, txt_len); | ||||||
| return out; | ||||||
| } | ||||||
| } | ||||||
|
|
@@ -480,10 +520,16 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_ | |||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| int32_t alloc_length = 0; | ||||||
| if (ARROW_PREDICT_FALSE( | ||||||
| not is_datalen_valid(context, data_len, &alloc_length, out_len))) { | ||||||
|
kou marked this conversation as resolved.
Outdated
|
||||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte | ||||||
| // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of | ||||||
| // the output can be at most twice the length of the input | ||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len)); | ||||||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, alloc_length)); | ||||||
| if (out == nullptr) { | ||||||
| gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); | ||||||
| *out_len = 0; | ||||||
|
|
@@ -579,15 +625,24 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| return in; | ||||||
| } | ||||||
|
Comment on lines
621
to
624
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @kou This was not introduced with this change. Could it be solved in a separate issue/PR?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure. Could you open a separate issue for this?
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||||||
|
|
||||||
| int32_t alloc_length = 0; | ||||||
| // Check overflow: 4 * in_len | ||||||
| if (ARROW_PREDICT_FALSE( | ||||||
| arrow::internal::MultiplyWithOverflow(4, in_len, &alloc_length))) { | ||||||
|
Comment on lines
+626
to
+629
|
||||||
| gdv_fn_context_set_error_msg(context, "Would overflow maximum output size"); | ||||||
| *out_len = 0; | ||||||
| return ""; | ||||||
| } | ||||||
|
|
||||||
| // This variable is to control if there are multi-byte utf8 entries | ||||||
| bool has_multi_byte = false; | ||||||
|
|
||||||
| // This variable is to store the final result | ||||||
| char* result; | ||||||
| int result_len; | ||||||
| int32_t result_len; | ||||||
|
|
||||||
| // Searching multi-bytes in In | ||||||
| for (int i = 0; i < in_len; i++) { | ||||||
| for (int32_t i = 0; i < in_len; i++) { | ||||||
| unsigned char char_single_byte = in[i]; | ||||||
| if (char_single_byte > 127) { | ||||||
| // found a multi-byte utf-8 char | ||||||
|
|
@@ -598,7 +653,7 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
|
|
||||||
| // Searching multi-bytes in From | ||||||
| if (!has_multi_byte) { | ||||||
| for (int i = 0; i < from_len; i++) { | ||||||
| for (int32_t i = 0; i < from_len; i++) { | ||||||
| unsigned char char_single_byte = from[i]; | ||||||
| if (char_single_byte > 127) { | ||||||
| // found a multi-byte utf-8 char | ||||||
|
|
@@ -610,7 +665,7 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
|
|
||||||
| // Searching multi-bytes in To | ||||||
| if (!has_multi_byte) { | ||||||
| for (int i = 0; i < to_len; i++) { | ||||||
| for (int32_t i = 0; i < to_len; i++) { | ||||||
| unsigned char char_single_byte = to[i]; | ||||||
| if (char_single_byte > 127) { | ||||||
| // found a multi-byte utf-8 char | ||||||
|
|
@@ -621,7 +676,7 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| } | ||||||
|
|
||||||
| // If there are no multibytes in the input, work only with char | ||||||
| if (!has_multi_byte) { | ||||||
| if (not has_multi_byte) { | ||||||
|
kou marked this conversation as resolved.
Outdated
|
||||||
| // This variable is for receive the substitutions | ||||||
| result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len)); | ||||||
|
|
||||||
|
|
@@ -638,7 +693,7 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
|
|
||||||
| // This variable is for controlling the position in entry TO, for never repeat the | ||||||
| // changes | ||||||
| int start_compare; | ||||||
| int32_t start_compare; | ||||||
|
|
||||||
| if (to_len > 0) { | ||||||
| start_compare = 0; | ||||||
|
|
@@ -650,15 +705,15 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| // list, to mark deletion positions | ||||||
| const char empty = '\0'; | ||||||
|
|
||||||
| for (int in_for = 0; in_for < in_len; in_for++) { | ||||||
| for (int32_t in_for = 0; in_for < in_len; in_for++) { | ||||||
| if (subs_list.find(in[in_for]) != subs_list.end()) { | ||||||
| if (subs_list[in[in_for]] != empty) { | ||||||
| // If exist in map, only add the correspondent value in result | ||||||
| result[result_len] = subs_list[in[in_for]]; | ||||||
| result_len++; | ||||||
| } | ||||||
| } else { | ||||||
| for (int from_for = 0; from_for <= from_len; from_for++) { | ||||||
| for (int32_t from_for = 0; from_for <= from_len; from_for++) { | ||||||
| if (from_for == from_len) { | ||||||
| // If it's not in the FROM list, just add it to the map and the result. | ||||||
| subs_list.insert(std::pair<char, char>(in[in_for], in[in_for])); | ||||||
|
|
@@ -686,10 +741,11 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| } | ||||||
| } | ||||||
| } | ||||||
| } else { // If there are no multibytes in the input, work with std::strings | ||||||
| } else { | ||||||
| // If there are multibytes in the input, work with std::strings | ||||||
| // This variable is for receive the substitutions, malloc is in_len * 4 to receive | ||||||
| // possible inputs with 4 bytes | ||||||
| result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len * 4)); | ||||||
| result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, alloc_length)); | ||||||
|
|
||||||
| if (result == nullptr) { | ||||||
| gdv_fn_context_set_error_msg(context, | ||||||
|
|
@@ -704,7 +760,7 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
|
|
||||||
| // This variable is for controlling the position in entry TO, for never repeat the | ||||||
| // changes | ||||||
| int start_compare; | ||||||
| int32_t start_compare; | ||||||
|
|
||||||
| if (to_len > 0) { | ||||||
| start_compare = 0; | ||||||
|
|
@@ -717,11 +773,11 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| const std::string empty = ""; | ||||||
|
|
||||||
| // This variables is to control len of multi-bytes entries | ||||||
| int len_char_in = 0; | ||||||
| int len_char_from = 0; | ||||||
| int len_char_to = 0; | ||||||
| int32_t len_char_in = 0; | ||||||
| int32_t len_char_from = 0; | ||||||
| int32_t len_char_to = 0; | ||||||
|
|
||||||
| for (int in_for = 0; in_for < in_len; in_for += len_char_in) { | ||||||
| for (int32_t in_for = 0; in_for < in_len; in_for += len_char_in) { | ||||||
| // Updating len to char in this position | ||||||
| len_char_in = gdv_fn_utf8_char_length(in[in_for]); | ||||||
| // Making copy to std::string with length for this char position | ||||||
|
|
@@ -734,11 +790,7 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| result_len += static_cast<int>(subs_list[insert_copy_key].length()); | ||||||
| } | ||||||
| } else { | ||||||
| for (int from_for = 0; from_for <= from_len; from_for += len_char_from) { | ||||||
| // Updating len to char in this position | ||||||
| len_char_from = gdv_fn_utf8_char_length(from[from_for]); | ||||||
| // Making copy to std::string with length for this char position | ||||||
| std::string copy_from_compare(from + from_for, len_char_from); | ||||||
| for (int32_t from_for = 0; from_for <= from_len; from_for += len_char_from) { | ||||||
| if (from_for == from_len) { | ||||||
| // If it's not in the FROM list, just add it to the map and the result. | ||||||
| std::string insert_copy_value(in + in_for, len_char_in); | ||||||
|
|
@@ -751,6 +803,11 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in | |||||
| break; | ||||||
| } | ||||||
|
|
||||||
| // Updating len to char in this position | ||||||
| len_char_from = gdv_fn_utf8_char_length(from[from_for]); | ||||||
| // Making copy to std::string with length for this char position | ||||||
| std::string copy_from_compare(from + from_for, len_char_from); | ||||||
|
|
||||||
| if (insert_copy_key != copy_from_compare) { | ||||||
| // If this character does not exist in FROM list, don't need treatment | ||||||
| continue; | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.