Skip to content

Commit

Permalink
Misc hashing updates / upgrades (facebook#5909)
Browse files Browse the repository at this point in the history
Summary:
- Updated our included xxhash implementation to version 0.7.2 (== the latest dev version as of 2019-10-09).
- Using XXH_NAMESPACE (like other fb projects) to avoid potential name collisions.
- Added fastrange64, and unit tests for it and fastrange32. These are faster alternatives to hash % range.
- Use preview version of XXH3 instead of MurmurHash64A for NPHash64
-- Had to update cache_test to increase probability of passing for any given hash function.
- Use fastrange64 instead of % with uses of NPHash64
-- Had to fix WritePreparedTransactionTest.CommitOfDelayedPrepared to avoid deadlock apparently caused by new hash collision.
- Set default seed for NPHash64 because specifying a seed rarely makes sense for it.
- Removed unnecessary include xxhash.h in a popular .h file
- Rename preview version of XXH3 to XXH3p for clarity and to ease backward compatibility in case final version of XXH3 is integrated.

Relying on existing unit tests for NPHash64-related changes. Each new implementation of fastrange64 passed unit tests when manipulating my local build to select it. I haven't done any integration performance tests, but I consider the improved performance of the pieces being swapped in to be well established.
Pull Request resolved: facebook#5909

Differential Revision: D18125196

Pulled By: pdillinger

fbshipit-source-id: f6bf83d49d20cbb2549926adf454fd035f0ecc0d
  • Loading branch information
pdillinger authored and facebook-github-bot committed Oct 25, 2019
1 parent ec11eff commit ca7ccbe
Show file tree
Hide file tree
Showing 15 changed files with 3,267 additions and 1,041 deletions.
13 changes: 13 additions & 0 deletions build_tools/build_detect_platform
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,19 @@ elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2
fi

# Probe for the compiler's 128-bit integer extension: try to compile a tiny
# program that multiplies two 64-bit values into a __uint128_t and takes the
# upper 64 bits of the product. The output goes to /dev/null and compiler
# diagnostics are discarded; only the exit status is used.
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <cstdint>
int main() {
uint64_t a = 0xffffFFFFffffFFFF;
__uint128_t b = __uint128_t(a) * a;
a = static_cast<uint64_t>(b >> 64);
(void)a;
}
EOF
# If the probe compiled, define HAVE_UINT128_EXTENSION for all later
# compilations via COMMON_FLAGS.
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_UINT128_EXTENSION"
fi

# iOS doesn't support thread-local storage, but this check would erroneously
# succeed because the cross-compiler flags are added by the Makefile, not this
# script.
Expand Down
4 changes: 2 additions & 2 deletions cache/cache_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ TEST_P(CacheTest, EvictionPolicy) {
Insert(200, 201);

// Frequently used entry must be kept around
for (int i = 0; i < kCacheSize + 200; i++) {
for (int i = 0; i < kCacheSize * 2; i++) {
Insert(1000+i, 2000+i);
ASSERT_EQ(101, Lookup(100));
}
Expand Down Expand Up @@ -418,7 +418,7 @@ TEST_P(CacheTest, EvictionPolicyRef) {
Insert(303, 104);

// Insert entries much more than Cache capacity
for (int i = 0; i < kCacheSize + 200; i++) {
for (int i = 0; i < kCacheSize * 2; i++) {
Insert(1000 + i, 2000 + i);
}

Expand Down
5 changes: 3 additions & 2 deletions db/db_basic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -871,11 +871,12 @@ TEST_F(DBBasicTest, ChecksumTest) {
ASSERT_OK(Flush());
}

// verify data with each type of checksum
for (int i = 0; i <= kxxHash64; ++i) {
// with each valid checksum type setting...
for (int i = 0; i <= max_checksum; ++i) {
table_options.checksum = static_cast<ChecksumType>(i);
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Reopen(options);
// verify every type of checksum (should be regardless of that setting)
for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) {
ASSERT_EQ(Key(j), Get(Key(j)));
}
Expand Down
2 changes: 1 addition & 1 deletion db/memtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
}

port::RWMutex* MemTable::GetLock(const Slice& key) {
return &locks_[static_cast<size_t>(GetSliceNPHash64(key)) % locks_.size()];
return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())];
}

MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
Expand Down
3 changes: 1 addition & 2 deletions memtable/hash_linklist_rep.cc
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,7 @@ class HashLinkListRep : public MemTableRep {
}

size_t GetHash(const Slice& slice) const {
return NPHash64(slice.data(), static_cast<int>(slice.size()), 0) %
bucket_size_;
return fastrange64(GetSliceNPHash64(slice), bucket_size_);
}

Pointer* GetBucket(size_t i) const {
Expand Down
10 changes: 6 additions & 4 deletions table/block_based/block_based_table_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -733,11 +733,13 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
break;
}
case kxxHash: {
void* xxh = XXH32_init(0);
XXH32_update(xxh, block_contents.data(),
XXH32_state_t* const state = XXH32_createState();
XXH32_reset(state, 0);
XXH32_update(state, block_contents.data(),
static_cast<uint32_t>(block_contents.size()));
XXH32_update(xxh, trailer, 1); // Extend to cover block type
EncodeFixed32(trailer_without_type, XXH32_digest(xxh));
XXH32_update(state, trailer, 1); // Extend to cover block type
EncodeFixed32(trailer_without_type, XXH32_digest(state));
XXH32_freeState(state);
break;
}
case kxxHash64: {
Expand Down
2 changes: 0 additions & 2 deletions table/format.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
#include "port/malloc.h"
#include "port/port.h" // noexcept
#include "table/persistent_cache_options.h"
#include "util/crc32c.h"
#include "util/xxhash.h"

namespace rocksdb {

Expand Down
4 changes: 2 additions & 2 deletions trace_replay/block_cache_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ bool ShouldTrace(const Slice& block_key, const TraceOptions& trace_options) {
}
// We use spatial downsampling so that we have a complete access history for a
// block.
const uint64_t hash = GetSliceNPHash64(block_key);
return hash % trace_options.sampling_frequency == 0;
return 0 == fastrange64(GetSliceNPHash64(block_key),
trace_options.sampling_frequency);
}
} // namespace

Expand Down
61 changes: 43 additions & 18 deletions util/hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,25 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Simple hash function used for internal data structures
// Common hash functions with convenient interfaces.

#pragma once
#include <stddef.h>
#include <stdint.h>

#include "rocksdb/slice.h"
#include "util/murmurhash.h"
#include "util/xxhash.h"

namespace rocksdb {

// Non-persistent hash. Only used for in-memory data structure
// The hash results are applicable to change.
extern uint64_t NPHash64(const char* data, size_t n, uint32_t seed);
// Non-persistent 64-bit hash. It must only be used for in-memory data
// structures, so the hash results are subject to change. (For the same
// reason, it rarely makes sense to specify a seed for this function.)
inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed = 0) {
  // XXH3 is currently experimental, but generally faster than other quality
  // 64-bit hash functions.
  const uint64_t digest = XXH3p_64bits_withSeed(data, n, seed);
  return digest;
}

extern uint32_t Hash(const char* data, size_t n, uint32_t seed);

Expand All @@ -29,32 +34,52 @@ inline uint32_t BloomHash(const Slice& key) {
}

inline uint64_t GetSliceNPHash64(const Slice& s) {
return NPHash64(s.data(), s.size(), 0);
return NPHash64(s.data(), s.size());
}

// Hashes the bytes of a Slice with Hash(), always using the seed 397.
inline uint32_t GetSliceHash(const Slice& s) {
  const uint32_t slice_hash_seed = 397;
  return Hash(s.data(), s.size(), slice_hash_seed);
}

inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed) {
// Right now murmurhash2B is used. It should able to be freely
// changed to a better hash, without worrying about backward
// compatibility issue.
return MURMUR_HASH(data, static_cast<int>(n),
static_cast<unsigned int>(seed));
}

// Function object with a std::hash compatible interface: hashes a Slice by
// forwarding to GetSliceHash().
struct SliceHasher {
  uint32_t operator()(const Slice& s) const {
    const uint32_t slice_hash = GetSliceHash(s);
    return slice_hash;
  }
};

// An alternative to % for mapping a hash value to an arbitrary range. See
// https://github.com/lemire/fastrange and
// https://github.com/pdillinger/wormhashing/blob/2c4035a4462194bf15f3e9fc180c27c513335225/bloom_simulation_tests/foo.cc#L57
inline uint32_t fastrange32(uint32_t a, uint32_t h) {
uint64_t product = static_cast<uint64_t>(a) * h;
// https://github.com/lemire/fastrange
inline uint32_t fastrange32(uint32_t hash, uint32_t range) {
uint64_t product = uint64_t{range} * hash;
return static_cast<uint32_t>(product >> 32);
}

// Maps a 64-bit hash value onto an arbitrary range that fits in size_t,
// as an alternative to `hash % range`. The result is the upper 64 bits of
// the full-width product hash * range; a range of 0 therefore yields 0.
// See https://github.com/lemire/fastrange
// The range is a size_t rather than a uint64_t because that is more
// convenient for callers, with the side benefit of better optimization on
// 32-bit platforms.
inline size_t fastrange64(uint64_t hash, size_t range) {
#if defined(HAVE_UINT128_EXTENSION)
  // The compiler offers a 128-bit type; trust it to generate the right
  // wide multiplication.
  const __uint128_t product = static_cast<__uint128_t>(range) * hash;
  return static_cast<size_t>(product >> 64);
#else
  // Portable fallback: multiply 32-bit halves and keep only what lands in
  // the upper 64 bits of the 128-bit product.
  // NOTE: GCC seems to fully understand this code as 64-bit x {32 or 64}-bit
  // -> {96 or 128}-bit multiplication and optimize it down to a single
  // wide-result multiplication (64-bit platform) or two wide-result
  // multiplications (32-bit platforms, where the high half of the range is
  // zero).
  const uint64_t low32_mask = 0xffffFFFF;
  const uint64_t range64 = range;  // safe to shift by 32 even if size_t is 32-bit
  const uint64_t range_lo = range64 & low32_mask;
  const uint64_t range_hi = range64 >> 32;
  const uint64_t hash_lo = hash & low32_mask;
  const uint64_t hash_hi = hash >> 32;

  // Carry out of the low x low partial product, plus the first cross term.
  // This cannot overflow: (2^32 - 1)^2 + (2^32 - 1) < 2^64.
  uint64_t acc = ((range_lo * hash_lo) >> 32) + range_lo * hash_hi;

  // Second cross term. To avoid overflow, only its lower 32 bits are added
  // before the shift; its upper 32 bits are added afterwards.
  const uint64_t cross = range_hi * hash_lo;
  acc += cross & low32_mask;
  acc >>= 32;
  acc += cross >> 32;

  // High x high partial product contributes directly to the upper 64 bits.
  acc += range_hi * hash_hi;
  return static_cast<size_t>(acc);
#endif
}

} // namespace rocksdb
110 changes: 110 additions & 0 deletions util/hash_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,116 @@ TEST(HashTest, Values) {
3382479516u);
}

// Spot-checks rocksdb::fastrange32(hash, range) at boundary values. Every
// expectation below equals floor(hash * range / 2^32).
TEST(Fastrange32Test, Values) {
using rocksdb::fastrange32;
// Zero range: every hash maps to 0
EXPECT_EQ(fastrange32(0, 0), 0U);
EXPECT_EQ(fastrange32(123, 0), 0U);
EXPECT_EQ(fastrange32(0xffffffff, 0), 0U);

// One range: the only possible result is 0
EXPECT_EQ(fastrange32(0, 1), 0U);
EXPECT_EQ(fastrange32(123, 1), 0U);
EXPECT_EQ(fastrange32(0xffffffff, 1), 0U);

// Two range: the result flips from 0 to 1 at hash 0x80000000
EXPECT_EQ(fastrange32(0, 2), 0U);
EXPECT_EQ(fastrange32(123, 2), 0U);
EXPECT_EQ(fastrange32(0x7fffffff, 2), 0U);
EXPECT_EQ(fastrange32(0x80000000, 2), 1U);
EXPECT_EQ(fastrange32(0xffffffff, 2), 1U);

// Seven range: 613566756 is floor(2^32 / 7) and 1227133513 is
// floor(2 * 2^32 / 7), i.e. the last hashes mapping to 0 and 1 respectively
EXPECT_EQ(fastrange32(0, 7), 0U);
EXPECT_EQ(fastrange32(123, 7), 0U);
EXPECT_EQ(fastrange32(613566756, 7), 0U);
EXPECT_EQ(fastrange32(613566757, 7), 1U);
EXPECT_EQ(fastrange32(1227133513, 7), 1U);
EXPECT_EQ(fastrange32(1227133514, 7), 2U);
// etc.
EXPECT_EQ(fastrange32(0xffffffff, 7), 6U);

// Big: ranges at and just below 2^31; the largest hash maps to range - 1
EXPECT_EQ(fastrange32(1, 0x80000000), 0U);
EXPECT_EQ(fastrange32(2, 0x80000000), 1U);
EXPECT_EQ(fastrange32(4, 0x7fffffff), 1U);
EXPECT_EQ(fastrange32(4, 0x80000000), 2U);
EXPECT_EQ(fastrange32(0xffffffff, 0x7fffffff), 0x7ffffffeU);
EXPECT_EQ(fastrange32(0xffffffff, 0x80000000), 0x7fffffffU);
}

// Spot-checks rocksdb::fastrange64(hash, range) at boundary values. Every
// expectation below equals floor(hash * range / 2^64).
TEST(Fastrange64Test, Values) {
using rocksdb::fastrange64;
// Zero range: every hash maps to 0
EXPECT_EQ(fastrange64(0, 0), 0U);
EXPECT_EQ(fastrange64(123, 0), 0U);
EXPECT_EQ(fastrange64(0xffffFFFF, 0), 0U);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0), 0U);

// One range: the only possible result is 0
EXPECT_EQ(fastrange64(0, 1), 0U);
EXPECT_EQ(fastrange64(123, 1), 0U);
EXPECT_EQ(fastrange64(0xffffFFFF, 1), 0U);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 1), 0U);

// Two range: the result flips from 0 to 1 at hash 0x8000000000000000
EXPECT_EQ(fastrange64(0, 2), 0U);
EXPECT_EQ(fastrange64(123, 2), 0U);
EXPECT_EQ(fastrange64(0xffffFFFF, 2), 0U);
EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 2), 0U);
EXPECT_EQ(fastrange64(0x8000000000000000, 2), 1U);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 2), 1U);

// Seven range: 2635249153387078802 is floor(2^64 / 7) and
// 5270498306774157604 is floor(2 * 2^64 / 7), i.e. the last hashes mapping
// to 0 and 1 respectively
EXPECT_EQ(fastrange64(0, 7), 0U);
EXPECT_EQ(fastrange64(123, 7), 0U);
EXPECT_EQ(fastrange64(0xffffFFFF, 7), 0U);
EXPECT_EQ(fastrange64(2635249153387078802, 7), 0U);
EXPECT_EQ(fastrange64(2635249153387078803, 7), 1U);
EXPECT_EQ(fastrange64(5270498306774157604, 7), 1U);
EXPECT_EQ(fastrange64(5270498306774157605, 7), 2U);
EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 7), 3U);
EXPECT_EQ(fastrange64(0x8000000000000000, 7), 3U);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 7), 6U);

// Big but 32-bit range: these ranges are valid whatever the width of size_t
EXPECT_EQ(fastrange64(0x100000000, 0x80000000), 0U);
EXPECT_EQ(fastrange64(0x200000000, 0x80000000), 1U);
EXPECT_EQ(fastrange64(0x400000000, 0x7fffFFFF), 1U);
EXPECT_EQ(fastrange64(0x400000000, 0x80000000), 2U);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0x7fffFFFF), 0x7fffFFFEU);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0x80000000), 0x7fffFFFFU);

// Big, > 32-bit range: only compiled when size_t is 64 bits wide, since
// these ranges do not fit in a 32-bit size_t
#if SIZE_MAX == UINT64_MAX
EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 0x4200000002), 0x2100000000U);
EXPECT_EQ(fastrange64(0x8000000000000000, 0x4200000002), 0x2100000001U);

EXPECT_EQ(fastrange64(0x0000000000000000, 420000000002), 0U);
EXPECT_EQ(fastrange64(0x7fffFFFFffffFFFF, 420000000002), 210000000000U);
EXPECT_EQ(fastrange64(0x8000000000000000, 420000000002), 210000000001U);
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 420000000002), 420000000001U);

// Largest hash with largest range maps to range - 1
EXPECT_EQ(fastrange64(0xffffFFFFffffFFFF, 0xffffFFFFffffFFFF),
0xffffFFFFffffFFFEU);
#endif
}

// Non-inline wrapper around rocksdb::fastrange32, kept only so that the
// generated code can be inspected in the disassembly.
uint32_t fastrange32(uint32_t hash, uint32_t range) {
  const uint32_t mapped = rocksdb::fastrange32(hash, range);
  return mapped;
}

// Non-inline wrapper around rocksdb::fastrange64, kept only so that the
// generated code can be inspected in the disassembly.
size_t fastrange64(uint64_t hash, size_t range) {
  const size_t mapped = rocksdb::fastrange64(hash, range);
  return mapped;
}

// Non-inline wrapper around rocksdb::NPHash64 (called without an explicit
// seed), kept only so that the generated code can be inspected in the
// disassembly.
uint64_t NPHash64(const char* data, size_t n) {
  const uint64_t digest = rocksdb::NPHash64(data, n);
  return digest;
}

int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);

Expand Down
Loading

0 comments on commit ca7ccbe

Please sign in to comment.