Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions cpp/arcticdb/column_store/column_algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,17 @@ ColumnData::ColumnDataIterator<TDT, IT, ID, true> bound_search(

const size_t block_pos = first_block;
const RawType* block_ptr = block_data_at(block_pos);
const size_t block_row_count = block_row_count_at(block_pos);
const size_t first = (block_pos == begin_block) ? begin_in_block_offset : 0;
const size_t last = (block_pos == end_block) ? end_in_block_offset : block_row_count_at(block_pos);
const size_t last = (block_pos == end_block) ? end_in_block_offset : block_row_count;
const RawType* found = bisect(block_ptr + first, block_ptr + last, value);
if (found == block_ptr + last) {
// If bisect doesn't find the result in the block, there is no result in the given [begin, end)
return end;
}
return ColumnData::ColumnDataIterator<TDT, IT, ID, true>(data, block_pos, static_cast<size_t>(found - block_ptr));
return ColumnData::ColumnDataIterator<TDT, IT, ID, true>(
data, block_pos, static_cast<size_t>(found - block_ptr), block_ptr, block_row_count
);
}

// Gallop forward from `begin` in steps of 2**n until an element after value is reached.
Expand Down Expand Up @@ -456,7 +459,22 @@ gallop_bracket(
if (block > end_block_idx || (block == end_block_idx && offset >= end_in_block_offset)) {
return end;
}
return ColumnData::ColumnDataIterator<TDT, IT, ID, true>(data, block, offset);
return ColumnData::ColumnDataIterator<TDT, IT, ID, true>(
data, block, offset, block_data_at(block), block_row_count_at(block)
);
};

// Optimized variants for the first block
auto record_probe_in_first_block = [&](size_t next_offset, RawType probe_value) {
prev_offset = cur_offset;
cur_offset = next_offset;
return is_before(probe_value, value);
};
Comment on lines +468 to +472
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these two extra assignments that are omitted really make a difference?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most probably they do not.

Most of the benefit is from reusing the already calculated first_block_row_count and first_block_data in make_iter_in_first_block.

It made sense to also add a first block variant of record_probe as well, to make the invariant clearer


auto make_iter_in_first_block = [&](size_t offset) -> ColumnData::ColumnDataIterator<TDT, IT, ID, true> {
return ColumnData::ColumnDataIterator<TDT, IT, ID, true>(
data, first_block_idx, offset, first_block_data, first_block_row_count
);
};

// Probe within the first block at first_offset + 2**n
Expand All @@ -466,19 +484,19 @@ gallop_bracket(
size_t step = 1;
for (; first_offset + step + 1 < up_to; step *= 2) {
const size_t probe_offset = first_offset + step;
if (!record_probe(first_block_idx, probe_offset + 1, first_block_data[probe_offset])) {
return {make_iter(prev_block, prev_offset), make_iter(cur_block, cur_offset)};
if (!record_probe_in_first_block(probe_offset + 1, first_block_data[probe_offset])) {
return {make_iter_in_first_block(prev_offset), make_iter_in_first_block(cur_offset)};
}
}

if (end_block_idx == first_block_idx) {
// End lies in the first block; resulting range is [cur, end).
return {make_iter(cur_block, cur_offset), end};
return {make_iter_in_first_block(cur_offset), end};
}

// Probe the last element of the first block. Post-probe position is (first_block_idx + 1, 0).
if (!record_probe(first_block_idx + 1, 0, first_block_data[first_block_row_count - 1])) {
return {make_iter(prev_block, prev_offset), make_iter(cur_block, cur_offset)};
return {make_iter_in_first_block(prev_offset), make_iter(cur_block, cur_offset)};
}

// Answer is after the first block — probe the last elements of blocks at first_idx + 2**n
Expand Down Expand Up @@ -609,7 +627,7 @@ size_t lower_bound_idx(
? column_data.template citerator_at<TDT, IteratorType::ENUMERATED>(*to)
: column_data.template cend<TDT, IteratorType::ENUMERATED, IteratorDensity::DENSE>();
auto result = lower_bound<TDT, IteratorType::ENUMERATED, IteratorDensity::DENSE>(begin, end, value);
if (!result.current_block().has_value()) {
if (result.current_block_data() == nullptr) {
// Iterator to end doesn't have `->idx()`
return column.row_count();
}
Expand Down Expand Up @@ -637,7 +655,7 @@ size_t upper_bound_idx(
? column_data.template citerator_at<TDT, IteratorType::ENUMERATED>(*to)
: column_data.template cend<TDT, IteratorType::ENUMERATED, IteratorDensity::DENSE>();
auto result = upper_bound<TDT, IteratorType::ENUMERATED, IteratorDensity::DENSE>(begin, end, value);
if (!result.current_block().has_value()) {
if (result.current_block_data() == nullptr) {
// Iterator to end doesn't have `->idx()`
return column.row_count();
}
Expand Down
74 changes: 50 additions & 24 deletions cpp/arcticdb/column_store/column_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,28 +206,36 @@ struct ColumnData {
block_pos_(block_pos) {
load_current_block();
in_block_offset_ = in_block_offset;
if constexpr (iterator_type == IteratorType::ENUMERATED) {
if constexpr (iterator_density == IteratorDensity::SPARSE) {
util::raise_rte("ColumnDataIterator at-position constructor not supported for SPARSE iteration");
} else {
const size_t block_start_idx = parent_->buffer().block_byte_offset(block_pos) / sizeof(RawType);
data_.idx_ = static_cast<ssize_t>(block_start_idx + in_block_offset);
}
}
recalculate_enumeration();
}

// Optimized constructor that can skip `load_current_block` if caller already has `block_data` and
// `block_row_count` Frequently used in search methods.
ColumnDataIterator(
const ColumnData* parent, size_t block_pos, size_t in_block_offset, const RawType* block_data,
size_t block_row_count
) :
parent_(parent),
block_pos_(block_pos),
block_begin_(block_data),
in_block_offset_(in_block_offset),
block_size_(block_row_count) {
recalculate_enumeration();
}

template<bool OtherConst>
explicit ColumnDataIterator(const ColumnDataIterator<TDT, iterator_type, iterator_density, OtherConst>& other) :
parent_(other.parent_),
block_pos_(other.block_pos_),
opt_block_(other.opt_block_),
block_begin_(other.block_begin_),
in_block_offset_(other.in_block_offset_),
block_size_(other.block_size_),
data_(other.data_) {}

// Minimal accessors used by the search algorithms in column_algorithms.hpp.
[[nodiscard]] const ColumnData* parent() const { return parent_; }
[[nodiscard]] const std::optional<TypedBlockData<TDT>>& current_block() const { return opt_block_; }
// nullptr when the iterator is at end (no current block).
[[nodiscard]] const RawType* current_block_data() const { return block_begin_; }
// Index of the block this iterator currently points into. For end iterators this is num_blocks.
[[nodiscard]] size_t current_block_index() const { return block_pos_; }
[[nodiscard]] size_t current_in_block_offset() const { return in_block_offset_; }
Expand All @@ -253,23 +261,40 @@ struct ColumnData {
}

void load_current_block() {
opt_block_ = parent_->template typed_block_at_position<TDT>(block_pos_);
block_size_ = opt_block_.has_value() ? opt_block_->row_count() : 0;
const size_t num_blocks = parent_->num_blocks();
const auto& blocks = parent_->buffer().blocks();
in_block_offset_ = 0;
// It is possible for arrow sparse data to have blocks with zero set rows.
// We need to skip all blocks with zero rows to get to the next iterator position.
while (ARCTICDB_UNLIKELY(opt_block_.has_value() && block_size_ == 0)) {
// Skip all such blocks to get to the next iterator position.
while (block_pos_ < num_blocks) {
IMemBlock* block = blocks[block_pos_];
block_begin_ = reinterpret_cast<const RawType*>(block->data());
block_size_ = block->logical_size() / sizeof(RawType);
if (ARCTICDB_LIKELY(block_size_ != 0)) {
return;
}
++block_pos_;
opt_block_ = parent_->template typed_block_at_position<TDT>(block_pos_);
block_size_ = opt_block_.has_value() ? opt_block_->row_count() : 0;
}
in_block_offset_ = 0;
block_begin_ = nullptr;
block_size_ = 0;
}

void advance_block() {
++block_pos_;
load_current_block();
}

void recalculate_enumeration() {
if constexpr (iterator_type == IteratorType::ENUMERATED) {
if constexpr (iterator_density == IteratorDensity::SPARSE) {
util::raise_rte("ColumnDataIterator at-position constructor not supported for SPARSE iteration");
} else {
const size_t block_start_idx = parent_->buffer().block_byte_offset(block_pos_) / sizeof(RawType);
data_.idx_ = static_cast<ssize_t>(block_start_idx + in_block_offset_);
}
}
}

template<bool OtherConst>
bool equal(const ColumnDataIterator<TDT, iterator_type, iterator_density, OtherConst>& other) const {
ARCTICDB_DEBUG_CHECK(
Expand All @@ -285,14 +310,14 @@ struct ColumnData {
{
ARCTICDB_DEBUG_CHECK(
ErrorCode::E_ASSERTION_FAILURE,
opt_block_.has_value(),
block_begin_ != nullptr,
"Dereferencing end iterator in ColumnDataIterator"
);
if constexpr (iterator_type == IteratorType::ENUMERATED) {
data_.ptr_ = const_cast<RawType*>(opt_block_->data() + in_block_offset_);
data_.ptr_ = const_cast<RawType*>(block_begin_ + in_block_offset_);
return data_;
} else {
return *(opt_block_->data() + in_block_offset_);
return *(block_begin_ + in_block_offset_);
}
}

Expand All @@ -301,20 +326,21 @@ struct ColumnData {
{
ARCTICDB_DEBUG_CHECK(
ErrorCode::E_ASSERTION_FAILURE,
opt_block_.has_value(),
block_begin_ != nullptr,
"Dereferencing end iterator in ColumnDataIterator"
);
if constexpr (iterator_type == IteratorType::ENUMERATED) {
data_.ptr_ = const_cast<RawType*>(opt_block_->data() + in_block_offset_);
data_.ptr_ = const_cast<RawType*>(block_begin_ + in_block_offset_);
return *const_cast<typename base_type::value_type*>(&data_);
} else {
return *const_cast<RawType*>(opt_block_->data() + in_block_offset_);
return *const_cast<RawType*>(block_begin_ + in_block_offset_);
}
}

const ColumnData* parent_{nullptr};
size_t block_pos_{0};
std::optional<TypedBlockData<TDT>> opt_block_{std::nullopt};
// Raw pointer to the start of the current block's data, nullptr at end.
const RawType* block_begin_{nullptr};
size_t in_block_offset_{0};
size_t block_size_{0};
// Mutable is to allow assigning `ptr_` lazily on dereference for ENUMERATED
Expand Down
2 changes: 1 addition & 1 deletion cpp/arcticdb/column_store/test/test_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ void check_search_on_column(
: column_data.cbegin<SearchTDT, IteratorType::ENUMERATED, IteratorDensity::DENSE>();
auto end = to.has_value() ? column_data.citerator_at<SearchTDT, IteratorType::ENUMERATED>(*to)
: column_data.cend<SearchTDT, IteratorType::ENUMERATED, IteratorDensity::DENSE>();
auto res_idx = [&](auto& it) { return it.current_block().has_value() ? it->idx() : total; };
auto res_idx = [&](auto& it) { return it.current_block_data() != nullptr ? it->idx() : total; };
const auto std_begin = values.begin() + from.value_or(0);
const auto std_end = values.begin() + to.value_or(values.size());
for (auto v : probes) {
Expand Down
Loading