Skip to content

Commit

Permalink
Reduce iterator key comparison for upper/lower bound check (2nd attem…
Browse files Browse the repository at this point in the history
…pt) (#5468)

Summary:
This is a second attempt for #5111, with the fix to redo iterate bounds check after `SeekXXX()`. This is because MyRocks may change iterate bounds between seek.

See #5111 for original benchmark result and discussion.

Closes #5463.
Pull Request resolved: #5468

Test Plan: Existing rocksdb tests, plus myrocks test `rocksdb.optimizer_loose_index_scans` and `rocksdb.group_min_max`.

Differential Revision: D15863332

fbshipit-source-id: ab4aba5899838591806b8673899bd465f3f53e18
  • Loading branch information
Yi Wu authored and facebook-github-bot committed Jul 2, 2019
1 parent cfdf211 commit 662ce62
Show file tree
Hide file tree
Showing 9 changed files with 212 additions and 24 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
* Reduce binary search when iterator reseek into the same data block.
* DBIter::Next() can skip user key checking if previous entry's seqnum is 0.
* Merging iterator to avoid child iterator reseek for some cases
* Reduce iterator key comparision for upper/lower bound check.
* Log Writer will flush after finishing the whole record, rather than a fragment.
* Lower MultiGet batching API latency by reading data blocks from disk in parallel

Expand Down
9 changes: 7 additions & 2 deletions db/db_iter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check)

is_key_seqnum_zero_ = (ikey_.sequence == 0);

if (iterate_upper_bound_ != nullptr &&
assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() ||
user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0);
if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() &&
user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) {
break;
}
Expand Down Expand Up @@ -859,7 +861,10 @@ void DBIter::PrevInternal() {
return;
}

if (iterate_lower_bound_ != nullptr &&
assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
user_comparator_.Compare(saved_key_.GetUserKey(),
*iterate_lower_bound_) >= 0);
if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
user_comparator_.Compare(saved_key_.GetUserKey(),
*iterate_lower_bound_) < 0) {
// We've iterated earlier than the user-specified lower bound.
Expand Down
62 changes: 62 additions & 0 deletions db/db_iterator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2759,6 +2759,68 @@ TEST_P(DBIteratorTest, AvoidReseekChildIterator) {
SyncPoint::GetInstance()->DisableProcessing();
}

// MyRocks may change iterate bounds before seek. Simply test to make sure such
// usage doesn't break iterator.
TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
Options options = CurrentOptions();
options.compression = CompressionType::kNoCompression;
BlockBasedTableOptions table_options;
table_options.block_size = 100;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
std::string value(50, 'v');
Reopen(options);
ASSERT_OK(Put("aaa", value));
ASSERT_OK(Flush());
ASSERT_OK(Put("bbb", "v"));
ASSERT_OK(Put("ccc", "v"));
ASSERT_OK(Put("ddd", "v"));
ASSERT_OK(Flush());
ASSERT_OK(Put("eee", "v"));
ASSERT_OK(Flush());
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

std::string ub1 = "e";
std::string ub2 = "c";
Slice ub(ub1);
ReadOptions read_opts1;
read_opts1.iterate_upper_bound = &ub;
Iterator* iter = NewIterator(read_opts1);
// Seek and iterate accross block boundary.
iter->Seek("b");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("bbb", iter->key());
ub = Slice(ub2);
iter->Seek("b");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("bbb", iter->key());
iter->Next();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
delete iter;

std::string lb1 = "a";
std::string lb2 = "c";
Slice lb(lb1);
ReadOptions read_opts2;
read_opts2.iterate_lower_bound = &lb;
iter = NewIterator(read_opts2);
iter->SeekForPrev("d");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("ccc", iter->key());
lb = Slice(lb2);
iter->SeekForPrev("d");
ASSERT_TRUE(iter->Valid());
ASSERT_OK(iter->status());
ASSERT_EQ("ccc", iter->key());
iter->Prev();
ASSERT_FALSE(iter->Valid());
ASSERT_OK(iter->status());
delete iter;
}

INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
testing::Values(true, false));

Expand Down
48 changes: 41 additions & 7 deletions db/version_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -887,31 +887,46 @@ class LevelIterator final : public InternalIterator {
void SeekToFirst() override;
void SeekToLast() override;
void Next() final override;
bool NextAndGetResult(Slice* ret_key) override;
bool NextAndGetResult(IterateResult* result) override;
void Prev() override;

bool Valid() const override { return file_iter_.Valid(); }
Slice key() const override {
assert(Valid());
return file_iter_.key();
}

Slice value() const override {
assert(Valid());
return file_iter_.value();
}

Status status() const override {
return file_iter_.iter() ? file_iter_.status() : Status::OK();
}

inline bool MayBeOutOfLowerBound() override {
assert(Valid());
return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
}

inline bool MayBeOutOfUpperBound() override {
assert(Valid());
return file_iter_.MayBeOutOfUpperBound();
}

void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
if (file_iter_.iter()) {
file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
}
}

bool IsKeyPinned() const override {
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
file_iter_.iter() && file_iter_.IsKeyPinned();
}

bool IsValuePinned() const override {
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
file_iter_.iter() && file_iter_.IsValuePinned();
Expand Down Expand Up @@ -955,6 +970,7 @@ class LevelIterator final : public InternalIterator {
smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
}
CheckMayBeOutOfLowerBound();
return table_cache_->NewIterator(
read_options_, env_options_, icomparator_, *file_meta.file_metadata,
range_del_agg_, prefix_extractor_,
Expand All @@ -963,6 +979,19 @@ class LevelIterator final : public InternalIterator {
largest_compaction_key);
}

// Check if current file being fully within iterate_lower_bound.
//
// Note MyRocks may update iterate bounds between seek. To workaround it,
// we need to check and update may_be_out_of_lower_bound_ accordingly.
void CheckMayBeOutOfLowerBound() {
if (Valid() && read_options_.iterate_lower_bound != nullptr) {
may_be_out_of_lower_bound_ =
user_comparator_.Compare(
ExtractUserKey(file_smallest_key(file_index_)),
*read_options_.iterate_lower_bound) < 0;
}
}

TableCache* table_cache_;
const ReadOptions read_options_;
const EnvOptions& env_options_;
Expand All @@ -976,6 +1005,7 @@ class LevelIterator final : public InternalIterator {
bool should_sample_;
TableReaderCaller caller_;
bool skip_filters_;
bool may_be_out_of_lower_bound_ = true;
size_t file_index_;
int level_;
RangeDelAggregator* range_del_agg_;
Expand Down Expand Up @@ -1011,6 +1041,7 @@ void LevelIterator::Seek(const Slice& target) {
file_iter_.Seek(target);
}
SkipEmptyFileForward();
CheckMayBeOutOfLowerBound();
}

void LevelIterator::SeekForPrev(const Slice& target) {
Expand All @@ -1024,6 +1055,7 @@ void LevelIterator::SeekForPrev(const Slice& target) {
file_iter_.SeekForPrev(target);
SkipEmptyFileBackward();
}
CheckMayBeOutOfLowerBound();
}

void LevelIterator::SeekToFirst() {
Expand All @@ -1032,6 +1064,7 @@ void LevelIterator::SeekToFirst() {
file_iter_.SeekToFirst();
}
SkipEmptyFileForward();
CheckMayBeOutOfLowerBound();
}

void LevelIterator::SeekToLast() {
Expand All @@ -1040,15 +1073,17 @@ void LevelIterator::SeekToLast() {
file_iter_.SeekToLast();
}
SkipEmptyFileBackward();
CheckMayBeOutOfLowerBound();
}

void LevelIterator::Next() { NextImpl(); }

bool LevelIterator::NextAndGetResult(Slice* ret_key) {
bool LevelIterator::NextAndGetResult(IterateResult* result) {
NextImpl();
bool is_valid = Valid();
if (is_valid) {
*ret_key = key();
result->key = key();
result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
}
return is_valid;
}
Expand Down Expand Up @@ -4366,10 +4401,9 @@ Status VersionSet::Recover(
", last_sequence is %" PRIu64 ", log_number is %" PRIu64
",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
",min_log_number_to_keep is %" PRIu64 "\n",
manifest_path.c_str(), manifest_file_number_,
next_file_number_.load(), last_sequence_.load(), log_number,
prev_log_number_, column_family_set_->GetMaxColumnFamily(),
min_log_number_to_keep_2pc());
manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
last_sequence_.load(), log_number, prev_log_number_,
column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc());

for (auto cfd : *column_family_set_) {
if (cfd->IsDropped()) {
Expand Down
30 changes: 22 additions & 8 deletions table/block_based/block_based_table_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2896,6 +2896,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
FindKeyForward();
}

CheckDataBlockWithinUpperBound();
CheckOutOfBound();

if (target) {
Expand Down Expand Up @@ -2952,6 +2953,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
block_iter_.SeekForPrev(target);

FindKeyBackward();
CheckDataBlockWithinUpperBound();
assert(!block_iter_.Valid() ||
icomp_.Compare(target, block_iter_.key()) >= 0);
}
Expand All @@ -2969,6 +2971,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
InitDataBlock();
block_iter_.SeekToLast();
FindKeyBackward();
CheckDataBlockWithinUpperBound();
}

template <class TBlockIter, typename TValue>
Expand All @@ -2984,11 +2987,12 @@ void BlockBasedTableIterator<TBlockIter, TValue>::Next() {

template <class TBlockIter, typename TValue>
bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
Slice* ret_key) {
IterateResult* result) {
Next();
bool is_valid = Valid();
if (is_valid) {
*ret_key = key();
result->key = key();
result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
}
return is_valid;
}
Expand Down Expand Up @@ -3087,6 +3091,7 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
/*for_compaction=*/lookup_context_.caller ==
TableReaderCaller::kCompaction);
block_iter_points_to_real_block_ = true;
CheckDataBlockWithinUpperBound();
}
}

Expand Down Expand Up @@ -3140,13 +3145,12 @@ void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
return;
}
// Whether next data block is out of upper bound, if there is one.
bool next_block_is_out_of_bound = false;
if (read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_) {
next_block_is_out_of_bound =
(user_comparator_.Compare(*read_options_.iterate_upper_bound,
const bool next_block_is_out_of_bound =
read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
assert(!next_block_is_out_of_bound ||
user_comparator_.Compare(*read_options_.iterate_upper_bound,
index_iter_->user_key()) <= 0);
}
ResetDataIter();
index_iter_->Next();
if (next_block_is_out_of_bound) {
Expand Down Expand Up @@ -3210,6 +3214,16 @@ void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
}
}

template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::CheckDataBlockWithinUpperBound() {
if (read_options_.iterate_upper_bound != nullptr &&
block_iter_points_to_real_block_) {
data_block_within_upper_bound_ =
(user_comparator_.Compare(*read_options_.iterate_upper_bound,
index_iter_->user_key()) > 0);
}
}

InternalIterator* BlockBasedTable::NewIterator(
const ReadOptions& read_options, const SliceTransform* prefix_extractor,
Arena* arena, bool skip_filters, TableReaderCaller caller, size_t compaction_readahead_size) {
Expand Down
15 changes: 14 additions & 1 deletion table/block_based/block_based_table_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
void SeekToFirst() override;
void SeekToLast() override;
void Next() final override;
bool NextAndGetResult(Slice* ret_key) override;
bool NextAndGetResult(IterateResult* result) override;
void Prev() override;
bool Valid() const override {
return !is_out_of_bound_ &&
Expand Down Expand Up @@ -702,6 +702,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
// Whether iterator invalidated for being out of bound.
bool IsOutOfBound() override { return is_out_of_bound_; }

inline bool MayBeOutOfUpperBound() override {
assert(Valid());
return !data_block_within_upper_bound_;
}

void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
pinned_iters_mgr_ = pinned_iters_mgr;
}
Expand Down Expand Up @@ -768,6 +773,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
bool block_iter_points_to_real_block_;
// See InternalIteratorBase::IsOutOfBound().
bool is_out_of_bound_ = false;
// Whether current data block being fully within iterate upper bound.
bool data_block_within_upper_bound_ = false;
// True if we're standing at the first key of a block, and we haven't loaded
// that block yet. A call to value() will trigger loading the block.
bool is_at_first_key_from_index_ = false;
Expand Down Expand Up @@ -802,6 +809,12 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
void FindBlockForward();
void FindKeyBackward();
void CheckOutOfBound();

// Check if data block is fully within iterate_upper_bound.
//
// Note MyRocks may update iterate bounds between seek. To workaround it,
// we need to check and update data_block_within_upper_bound_ accordingly.
void CheckDataBlockWithinUpperBound();
};

} // namespace rocksdb
Loading

0 comments on commit 662ce62

Please sign in to comment.