diff --git a/cpp/arcticdb/CMakeLists.txt b/cpp/arcticdb/CMakeLists.txt index a19e58f8c7c..d952242de5a 100644 --- a/cpp/arcticdb/CMakeLists.txt +++ b/cpp/arcticdb/CMakeLists.txt @@ -514,6 +514,7 @@ set(arcticdb_srcs processing/operation_dispatch_binary_operator_minus.cpp processing/operation_dispatch_binary_operator_times.cpp processing/operation_dispatch_binary_operator_divide.cpp + processing/operation_dispatch_binary_operator_mod.cpp processing/operation_dispatch_ternary.cpp processing/query_planner.cpp processing/sorted_aggregation.cpp diff --git a/cpp/arcticdb/processing/expression_node.cpp b/cpp/arcticdb/processing/expression_node.cpp index ef8f10f3bf8..258d99c16f3 100644 --- a/cpp/arcticdb/processing/expression_node.cpp +++ b/cpp/arcticdb/processing/expression_node.cpp @@ -172,6 +172,7 @@ std::variant ExpressionNode::compute( case OperationType::SUB: case OperationType::MUL: case OperationType::DIV: + case OperationType::MOD: user_input::check( std::holds_alternative(left_type), "Unexpected bitset input as left operand to {}", @@ -226,6 +227,14 @@ std::variant ExpressionNode::compute( res = data_type_from_raw_type(); break; } + case OperationType::MOD: { + using TargetType = typename binary_operation_promoted_type< + typename left_type_info::RawType, + typename right_type_info::RawType, + std::remove_reference_t>::type; + res = data_type_from_raw_type(); + break; + } default: internal::raise("Unexpected binary operator"); } diff --git a/cpp/arcticdb/processing/operation_dispatch_binary.cpp b/cpp/arcticdb/processing/operation_dispatch_binary.cpp index 586a3eef8a3..51ee2b23987 100644 --- a/cpp/arcticdb/processing/operation_dispatch_binary.cpp +++ b/cpp/arcticdb/processing/operation_dispatch_binary.cpp @@ -143,6 +143,8 @@ VariantData dispatch_binary(const VariantData& left, const VariantData& right, O return visit_binary_operator(left, right, TimesOperator{}); case OperationType::DIV: return visit_binary_operator(left, right, DivideOperator{}); + case 
OperationType::MOD: + return visit_binary_operator(left, right, ModOperator{}); case OperationType::EQ: return visit_binary_comparator(left, right, EqualsOperator{}); case OperationType::NE: diff --git a/cpp/arcticdb/processing/operation_dispatch_binary.hpp b/cpp/arcticdb/processing/operation_dispatch_binary.hpp index 02362a91fe8..faebcb90243 100644 --- a/cpp/arcticdb/processing/operation_dispatch_binary.hpp +++ b/cpp/arcticdb/processing/operation_dispatch_binary.hpp @@ -627,6 +627,8 @@ extern template VariantData visit_binary_operator< arcticdb::TimesOperator>(const VariantData&, const VariantData&, TimesOperator&&); extern template VariantData visit_binary_operator< arcticdb::DivideOperator>(const VariantData&, const VariantData&, DivideOperator&&); +extern template VariantData visit_binary_operator< + arcticdb::ModOperator>(const VariantData&, const VariantData&, ModOperator&&); // instantiated in operation_dispatch_binary_comparator.cpp to reduce compilation memory use extern template VariantData visit_binary_comparator< diff --git a/cpp/arcticdb/processing/operation_dispatch_binary_operator_mod.cpp b/cpp/arcticdb/processing/operation_dispatch_binary_operator_mod.cpp new file mode 100644 index 00000000000..af79193f4fe --- /dev/null +++ b/cpp/arcticdb/processing/operation_dispatch_binary_operator_mod.cpp @@ -0,0 +1,13 @@ +/* + * Copyright 2026 Man Group Operations Limited + * + * Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + * + * As of the Change Date specified in that file, in accordance with the Business Source License, use of this software + * will be governed by the Apache License, version 2.0. 
+ */ +#include + +namespace arcticdb { +template VariantData visit_binary_operator(const VariantData&, const VariantData&, ModOperator&&); +} diff --git a/cpp/arcticdb/processing/operation_types.hpp b/cpp/arcticdb/processing/operation_types.hpp index 9b03a109a33..fcd84cf0082 100644 --- a/cpp/arcticdb/processing/operation_types.hpp +++ b/cpp/arcticdb/processing/operation_types.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -37,6 +38,7 @@ enum class OperationType : uint8_t { SUB, MUL, DIV, + MOD, // Comparison EQ, NE, @@ -70,6 +72,7 @@ inline std::string_view operation_type_to_str(const OperationType ot) { TO_STR(SUB) TO_STR(MUL) TO_STR(DIV) + TO_STR(MOD) TO_STR(EQ) TO_STR(NE) TO_STR(LT) @@ -103,6 +106,7 @@ struct PlusOperator; struct MinusOperator; struct TimesOperator; struct DivideOperator; +struct ModOperator; struct MembershipOperator; namespace arithmetic_promoted_type::details { @@ -230,6 +234,24 @@ struct binary_operation_promoted_type { 2 * max_width>>>>>>>>; }; +// Modulo cannot overflow, so no width-doubling is needed (unlike +/-/*). +// For mixed signed/unsigned integers, always use a signed type so that Python/Pandas +// sign semantics (result sign follows divisor) can produce negative results. 
+template +struct binary_operation_promoted_type { + static constexpr size_t max_width = arithmetic_promoted_type::details::max_width_v; + using type = std::conditional_t< + std::is_floating_point_v || std::is_floating_point_v, + std::conditional_t< + std::is_floating_point_v && std::is_floating_point_v, + std::conditional_t, + double>, + std::conditional_t< + std::is_unsigned_v && std::is_unsigned_v, + arithmetic_promoted_type::details::unsigned_width_t, + arithmetic_promoted_type::details::signed_width_t>>; +}; + template struct ternary_operation_promoted_type { static constexpr size_t max_width = arithmetic_promoted_type::details::max_width_v; @@ -356,6 +378,33 @@ struct DivideOperator { } }; +struct ModOperator { + template::type> + V apply(T t, U u) { + if constexpr (std::is_floating_point_v) { + const auto lhs = static_cast(t); + const auto rhs = static_cast(u); + // Match Python/Pandas modulo semantics where the result has the sign of the divisor. + auto result = std::fmod(lhs, rhs); + if (result != V{0} && ((rhs < V{0}) != (result < V{0}))) { + result += rhs; + } + return result; + } else { + auto lhs = static_cast(t); + auto rhs = static_cast(u); + auto result = lhs % rhs; + if constexpr (std::is_signed_v) { + // Match Python/Pandas modulo semantics where the result has the sign of the divisor. 
+ if (result != V{0} && ((rhs < V{0}) != (result < V{0}))) { + result += rhs; + } + } + return result; + } + } +}; + struct EqualsOperator { template bool operator()(T t, U u) const { @@ -715,6 +764,19 @@ struct formatter { } }; +template<> +struct formatter { + template + constexpr auto parse(ParseContext& ctx) { + return ctx.begin(); + } + + template + constexpr auto format(arcticdb::ModOperator, FormatContext& ctx) const { + return fmt::format_to(ctx.out(), "%"); + } +}; + template<> struct formatter { template diff --git a/cpp/arcticdb/processing/test/test_operation_dispatch.cpp b/cpp/arcticdb/processing/test/test_operation_dispatch.cpp index bf40c442650..ff9be824049 100644 --- a/cpp/arcticdb/processing/test/test_operation_dispatch.cpp +++ b/cpp/arcticdb/processing/test/test_operation_dispatch.cpp @@ -73,6 +73,52 @@ TEST(OperationDispatch, binary_operator) { EXPECT_THROW(visit_binary_operator(empty_column, value, PlusOperator{}), SchemaException); // val + empty col EXPECT_THROW(visit_binary_operator(value, empty_column, PlusOperator{}), SchemaException); + + // int col % val + auto modulo_value = std::make_shared(static_cast(7), DataType::INT64); + auto variant_data = visit_binary_operator(int_column, modulo_value, ModOperator{}); + ASSERT_TRUE(std::holds_alternative(variant_data)); + auto results_column = std::get(variant_data).column_; + for (size_t idx = 0; idx < num_rows; idx++) { + ASSERT_EQ(static_cast(idx) % 7, results_column->scalar_at(idx)); + } + + auto one = std::make_shared(static_cast(1), DataType::INT64); + auto nonzero_column_variant = visit_binary_operator(int_column, one, PlusOperator{}); + ASSERT_TRUE(std::holds_alternative(nonzero_column_variant)); + auto nonzero_column = std::get(nonzero_column_variant); + + // int col % int col (with non-zero divisor column) + auto variant_data_col_col = visit_binary_operator(int_column, nonzero_column, ModOperator{}); + ASSERT_TRUE(std::holds_alternative(variant_data_col_col)); + auto 
results_column_col_col = std::get(variant_data_col_col).column_; + for (size_t idx = 0; idx < num_rows; idx++) { + ASSERT_EQ(static_cast(idx), results_column_col_col->scalar_at(idx)); + } + + // val % int col. + auto variant_data_val_col = visit_binary_operator(modulo_value, nonzero_column, ModOperator{}); + ASSERT_TRUE(std::holds_alternative(variant_data_val_col)); + auto results_column_val_col = std::get(variant_data_val_col).column_; + for (size_t idx = 0; idx < num_rows; idx++) { + ASSERT_EQ( + static_cast(7) % static_cast(idx + 1), + results_column_val_col->scalar_at(idx) + ); + } + + // Match Python/Pandas behavior for negative floating-point values. + auto minus_three = std::make_shared(-3.0, DataType::FLOAT64); + auto plus_two = std::make_shared(2.0, DataType::FLOAT64); + auto variant_data_float = visit_binary_operator(minus_three, plus_two, ModOperator{}); + ASSERT_TRUE(std::holds_alternative>(variant_data_float)); + ASSERT_DOUBLE_EQ(std::get>(variant_data_float)->get(), 1.0); + + auto minus_two = std::make_shared(-2.0, DataType::FLOAT64); + auto variant_data_float_neg_divisor = visit_binary_operator(minus_three, minus_two, ModOperator{}); + ASSERT_TRUE(std::holds_alternative>(variant_data_float_neg_divisor)); + ASSERT_DOUBLE_EQ(std::get>(variant_data_float_neg_divisor)->get(), -1.0); + } TEST(OperationDispatch, binary_comparator) { @@ -145,4 +191,4 @@ TEST(OperationDispatch, binary_membership) { // empty col isnotin set ASSERT_TRUE(std::holds_alternative(visit_binary_membership(empty_column, value_set, IsNotInOperator{})) ); -} \ No newline at end of file +} diff --git a/cpp/arcticdb/version/python_bindings.cpp b/cpp/arcticdb/version/python_bindings.cpp index 1a553faf623..85b81fe0e7e 100644 --- a/cpp/arcticdb/version/python_bindings.cpp +++ b/cpp/arcticdb/version/python_bindings.cpp @@ -472,6 +472,7 @@ void register_bindings(py::module& version, py::exception= pd.Timedelta(minutes=10)) & (minute_in_hour < pd.Timedelta(minutes=11))] 
def test_filter_datetime_index_by_minute_with_modulo(lmdb_version_store_tiny_segment, any_output_format):
    """Filter a minute-frequency frame down to the rows in minute 10 of each hour using ``index % 1 hour``."""
    symbol = "test_filter_datetime_index_by_minute_with_modulo"
    lib = lmdb_version_store_tiny_segment
    lib._set_output_format_for_pipeline_tests(any_output_format)

    # Three hours of one-minute timestamps, so the filter should match one row per hour.
    timestamps = pd.date_range("2024-01-01", periods=180, freq="min")
    frame = pd.DataFrame({"col": np.arange(timestamps.shape[0], dtype=np.int64)}, index=timestamps)
    lib.write(symbol, frame)

    query = QueryBuilder()
    offset_in_hour = query["index"] % pd.Timedelta(hours=1)
    lower_bound = pd.Timedelta(minutes=10)
    upper_bound = pd.Timedelta(minutes=11)
    query = query[(offset_in_hour >= lower_bound) & (offset_in_hour < upper_bound)]

    in_tenth_minute = frame.index.minute == 10
    generic_filter_test(lib, symbol, query, frame[in_tenth_minute])
def test_projection_modulo_value_and_column_operands(lmdb_version_store_tiny_segment, any_output_format):
    """Project ``column % value``, ``value % column`` and ``column % column``; compare with pandas."""
    symbol = "test_projection_modulo_value_and_column_operands"
    lib = lmdb_version_store_tiny_segment
    lib._set_output_format_for_pipeline_tests(any_output_format)

    frame = pd.DataFrame(
        {"a": np.arange(1, 11, dtype=np.int64), "b": np.arange(11, 21, dtype=np.int64)},
        index=np.arange(10),
    )
    lib.write(symbol, frame)

    query = QueryBuilder()
    query = query.apply("a_mod_3", query["a"] % 3)
    query = query.apply("20_mod_a", 20 % query["a"])
    query = query.apply("a_mod_b", query["a"] % query["b"])

    # Same three expressions evaluated by pandas, appended in the same column order.
    expected = frame.assign(
        **{
            "a_mod_3": frame["a"] % 3,
            "20_mod_a": 20 % frame["a"],
            "a_mod_b": frame["a"] % frame["b"],
        }
    )

    _assert_projection_matches(lib, symbol, query, expected)


def test_projection_modulo_negative_integers_and_floats(lmdb_version_store_tiny_segment, any_output_format):
    """Project modulo by positive and negative divisors over signed int and float columns; compare with pandas."""
    symbol = "test_projection_modulo_negative_integers_and_floats"
    lib = lmdb_version_store_tiny_segment
    lib._set_output_format_for_pipeline_tests(any_output_format)

    frame = pd.DataFrame(
        {
            "int_col": np.array([-5, -4, -3, 3, 4, 5], dtype=np.int64),
            "float_col": np.array([-5.5, -4.5, -3.5, 3.5, 4.5, 5.5], dtype=np.float64),
        },
        index=np.arange(6),
    )
    lib.write(symbol, frame)

    # (output column, input column, divisor)
    projections = [
        ("int_mod_pos", "int_col", 2),
        ("int_mod_neg", "int_col", -2),
        ("float_mod_pos", "float_col", 2.0),
        ("float_mod_neg", "float_col", -2.0),
    ]

    query = QueryBuilder()
    for output_name, input_name, divisor in projections:
        query = query.apply(output_name, query[input_name] % divisor)

    expected = frame.copy()
    for output_name, input_name, divisor in projections:
        expected[output_name] = expected[input_name] % divisor

    _assert_projection_matches(lib, symbol, query, expected)


def test_projection_modulo_special_float_rhs_values(lmdb_version_store_tiny_segment, any_output_format):
    """Project a float column modulo ``0.0`` and modulo ``NaN``; compare with pandas."""
    symbol = "test_projection_modulo_special_float_rhs_values"
    lib = lmdb_version_store_tiny_segment
    lib._set_output_format_for_pipeline_tests(any_output_format)

    values = np.array([1.0, -1.0, 2.5, -2.5, np.nan], dtype=np.float64)
    frame = pd.DataFrame({"float_col": values}, index=np.arange(5))
    lib.write(symbol, frame)

    # (output column, divisor)
    divisors = [("mod_zero", 0.0), ("mod_nan", np.nan)]

    query = QueryBuilder()
    for output_name, divisor in divisors:
        query = query.apply(output_name, query["float_col"] % divisor)

    expected = frame.copy()
    for output_name, divisor in divisors:
        expected[output_name] = expected["float_col"] % divisor

    _assert_projection_matches(lib, symbol, query, expected)


def test_projection_modulo_infinite_rhs_raises():
    """Building a modulo projection with an infinite divisor must raise ``ArcticException``."""
    query = QueryBuilder()
    for output_name, divisor in (("mod_inf", np.inf), ("mod_neg_inf", -np.inf)):
        with pytest.raises(ArcticException, match="Infinite values not supported in queries"):
            query.apply(output_name, query["col"] % divisor)


def test_projection_modulo_mixed_type_non_representable_values(lmdb_version_store_tiny_segment, any_output_format):
    """Project modulo between columns of different widths, signedness and int/float kinds; compare with pandas."""
    symbol = "test_projection_modulo_mixed_type_non_representable_values"
    lib = lmdb_version_store_tiny_segment
    lib._set_output_format_for_pipeline_tests(any_output_format)

    frame = pd.DataFrame(
        {
            "u8_nonzero": np.array([10, 255, 1], dtype=np.uint8),
            "i64_large": np.array([300, 301, 302], dtype=np.int64),
            "f64_large": np.array([300.25, 301.25, 302.25], dtype=np.float64),
            "u64_small": np.array([2, 3, 4], dtype=np.uint64),
            "i64_negative": np.array([-1, -2, -3], dtype=np.int64),
        },
        index=np.arange(3),
    )
    lib.write(symbol, frame)

    # (output column, left operand column, right operand column)
    operand_pairs = [
        ("u8_mod_i64", "u8_nonzero", "i64_large"),
        ("i64_mod_u8", "i64_large", "u8_nonzero"),
        ("u8_mod_f64", "u8_nonzero", "f64_large"),
        ("f64_mod_u8", "f64_large", "u8_nonzero"),
        ("i64_neg_mod_u64", "i64_negative", "u64_small"),
        ("u64_mod_i64_neg", "u64_small", "i64_negative"),
    ]

    query = QueryBuilder()
    for output_name, left_name, right_name in operand_pairs:
        query = query.apply(output_name, query[left_name] % query[right_name])

    expected = frame.copy()
    for output_name, left_name, right_name in operand_pairs:
        expected[output_name] = expected[left_name] % expected[right_name]

    _assert_projection_matches(lib, symbol, query, expected)