From 0345343cfb220e1e1588687d480a197b748e599d Mon Sep 17 00:00:00 2001 From: Martin Ankerl Date: Thu, 25 Mar 2021 08:04:27 +0100 Subject: [PATCH] overflow fix: rehash with different hash instead (#121) This uses a pretty nice idea to rehash with a different hash whenever the info byte would cause an overflow, and also improves hash quality when a bad hash is used. * The murmur finalizer is split up into 2 parts, the final part (one multiplication and one xor&shift) is *always* done, regardless of the hash used. That way the robin_hood hash is murmur's fmix, and all other hashes have an addition mixer to improve its quality. * The additional mixer uses a member variable for the multiplication constant. When an overflow is encountered, that constant is changed, thus the rehash will spread the elements completely differently. * Rehashing is tried multiple times (up to 256 times), so we won't get an endless loop if this rehashing still does not work. --- src/include/robin_hood.h | 110 ++++++++++++---------- src/test/unit/CMakeLists.txt | 1 + src/test/unit/unit_iterator_twice_bug.cpp | 3 +- src/test/unit/unit_overflow.cpp | 11 ++- src/test/unit/unit_playback.cpp | 51 ++++++++++ 5 files changed, 122 insertions(+), 54 deletions(-) create mode 100644 src/test/unit/unit_playback.cpp diff --git a/src/include/robin_hood.h b/src/include/robin_hood.h index f973be70..d023731d 100644 --- a/src/include/robin_hood.h +++ b/src/include/robin_hood.h @@ -36,7 +36,7 @@ // see https://semver.org/ #define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes #define ROBIN_HOOD_VERSION_MINOR 11 // for adding functionality in a backwards-compatible manner -#define ROBIN_HOOD_VERSION_PATCH 0 // for backwards-compatible bug fixes +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes #include #include @@ -553,14 +553,6 @@ struct NodeAllocator { template struct NodeAllocator : public BulkPoolAllocator {}; -// dummy hash, unsed as mixer when robin_hood::hash is already used -template -struct identity_hash { - constexpr size_t operator()(T const& obj) const noexcept { - return static_cast(obj); - } -}; - // c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it either, so I'm making // my own here. namespace swappable { @@ -745,8 +737,10 @@ inline size_t hash_bytes(void const* ptr, size_t len) noexcept { } h ^= h >> r; - h *= m; - h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx anyways + // h *= m; + // h ^= h >> r; return static_cast(h); } @@ -756,8 +750,10 @@ inline size_t hash_int(uint64_t x) noexcept { x ^= x >> 33U; x *= UINT64_C(0xff51afd7ed558ccd); x ^= x >> 33U; - x *= UINT64_C(0xc4ceb9fe1a85ec53); - x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; return static_cast(x); } @@ -1349,18 +1345,17 @@ class Table // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. template void keyToIdx(HashKey&& key, size_t* idx, InfoType* info) const { - // for a user-specified hash that is *not* robin_hood::hash, apply robin_hood::hash as - // an additional mixing step. This serves as a bad hash prevention, if the given data is + // In addition to whatever hash is used, add another mul & shift so we get better hashing. + // This serves as a bad hash prevention, if the given data is // badly mixed. - using Mix = - typename std::conditional, hasher>::value, - ::robin_hood::detail::identity_hash, - ::robin_hood::hash>::type; + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; // the lower InitialInfoNumBits are reserved for info. - auto h = Mix{}(WHash::operator()(key)); *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); - *idx = (h >> InitialInfoNumBits) & mMask; + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; } // forwards the index by one, wrapping around at the end @@ -1451,11 +1446,11 @@ class Table // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is resized. // @return True on success, false if something went wrong - bool insert_move(Node&& keyval) { + void insert_move(Node&& keyval) { // we don't retry, fail if overflowing // don't need to check max num elements if (0 == mMaxNumElementsAllowed && !try_increase_info()) { - return false; + throwOverflowError(); } size_t idx{}; @@ -1492,7 +1487,6 @@ class Table mInfo[insertion_idx] = insertion_info; ++mNumElements; - return true; } public: @@ -1542,6 +1536,7 @@ class Table , DataPool(std::move(static_cast(o))) { ROBIN_HOOD_TRACE(this) if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); mKeyVals = std::move(o.mKeyVals); mInfo = std::move(o.mInfo); mNumElements = std::move(o.mNumElements); @@ -1560,6 +1555,7 @@ class Table if (o.mMask) { // only move stuff if the other map actually has some data destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); mKeyVals = std::move(o.mKeyVals); mInfo = std::move(o.mInfo); mNumElements = std::move(o.mNumElements); @@ -1595,6 +1591,7 @@ class Table ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; mKeyVals = static_cast( detail::assertNotNull(std::malloc(numBytesTotal))); // no need for calloc because clonData does memcpy @@ -1662,6 +1659,7 @@ class Table WHash::operator=(static_cast(o)); WKeyEqual::operator=(static_cast(o)); DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; mNumElements = o.mNumElements; mMask = o.mMask; mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; @@ -2085,9 +2083,7 @@ class Table // only actually do anything when the new size is bigger than the old one. This prevents to // continuously allocate for each reserve() call. if (newSize < mMask + 1) { - if (!rehashPowerOfTwo(newSize, true)) { - throwOverflowError(); - } + rehashPowerOfTwo(newSize, true); } } @@ -2195,16 +2191,14 @@ class Table // only actually do anything when the new size is bigger than the old one. This prevents to // continuously allocate for each reserve() call. if (forceRehash || newSize > mMask + 1) { - if (!rehashPowerOfTwo(newSize, false)) { - throwOverflowError(); - } + rehashPowerOfTwo(newSize, false); } } // reserves space for at least the specified number of elements. // only works if numBuckets if power of two // True on success, false otherwise - bool rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { ROBIN_HOOD_TRACE(this) Node* const oldKeyVals = mKeyVals; @@ -2217,16 +2211,16 @@ class Table if (oldMaxElementsWithBuffer > 1) { for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { if (oldInfo[i] != 0) { - if (!insert_move(std::move(oldKeyVals[i]))) { - return false; - } + // might throw an exception, which is really bad since we are in the middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); // destroy the node but DON'T destroy the data. oldKeyVals[i].~Node(); } } - // this check is not necessary as it's guarded by the previous if, but it helps silence - // g++'s overeager "attempt to free a non-heap object 'map' + // this check is not necessary as it's guarded by the previous if, but it helps + // silence g++'s overeager "attempt to free a non-heap object 'map' // [-Werror=free-nonheap-object]" warning. if (oldKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { // don't destroy old data: put it into the pool instead @@ -2237,7 +2231,6 @@ class Table } } } - return true; } ROBIN_HOOD(NOINLINE) void throwOverflowError() const { @@ -2332,11 +2325,11 @@ class Table enum class InsertionState { overflow_error, key_found, new_node, overwrite_node }; // Finds key, and if not already present prepares a spot where to pot the key & value. - // This potentially shifts nodes out of the way, updates mInfo and number of inserted elements, - // so the only operation left to do is create/assign a new node at that spot. + // This potentially shifts nodes out of the way, updates mInfo and number of inserted + // elements, so the only operation left to do is create/assign a new node at that spot. template std::pair insertKeyPrepareEmptySpot(OtherKey&& key) { - while (true) { + for (int i = 0; i < 256; ++i) { size_t idx{}; InfoType info{}; keyToIdx(key, &idx, &info); @@ -2382,6 +2375,9 @@ class Table ? InsertionState::new_node : InsertionState::overwrite_node); } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); } bool try_increase_info() { @@ -2429,12 +2425,25 @@ class Table << maxNumElementsAllowed << ", load=" << (static_cast(mNumElements) * 100.0 / (static_cast(mMask) + 1))) - // it seems we have a really bad hash function! don't try to resize again + + nextHashMultiplier(); if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { - return false; + // we have to resize, even though there would still be plenty of space left! + // Try to rehash instead. Delete freed memory so we don't steadyily increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. + // Make sure we only have odd numbers, so that the multiplication is reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); } + return true; + } - return rehashPowerOfTwo((mMask + 1) * 2, false); + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. This is necessary + // so that the hash stays a mixing function (and thus doesn't have any information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); } void destroy() { @@ -2467,13 +2476,14 @@ class Table } // members are sorted so no padding occurs - Node* mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 8 - uint8_t* mInfo = reinterpret_cast(&mMask); // 8 byte 16 - size_t mNumElements = 0; // 8 byte 24 - size_t mMask = 0; // 8 byte 32 - size_t mMaxNumElementsAllowed = 0; // 8 byte 40 - InfoType mInfoInc = InitialInfoInc; // 4 byte 44 - InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 48 + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node* mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t* mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 // 16 byte 56 if NodeAllocator }; diff --git a/src/test/unit/CMakeLists.txt b/src/test/unit/CMakeLists.txt index 563bc529..767a35b5 100644 --- a/src/test/unit/CMakeLists.txt +++ b/src/test/unit/CMakeLists.txt @@ -56,6 +56,7 @@ target_sources_local(rh PRIVATE unit_initializerlist.cpp unit_insert_collision.cpp unit_insert_or_assign.cpp + unit_playback.cpp unit_insert.cpp unit_iterator_twice_bug.cpp unit_iterators_ctor.cpp diff --git a/src/test/unit/unit_iterator_twice_bug.cpp b/src/test/unit/unit_iterator_twice_bug.cpp index 6a678354..4a14083d 100644 --- a/src/test/unit/unit_iterator_twice_bug.cpp +++ b/src/test/unit/unit_iterator_twice_bug.cpp @@ -38,7 +38,8 @@ struct hash { } // namespace robin_hood -TEST_CASE("iterator_twice_bug") { +// don't run, this test only works when we know exactly where insertion happens +TEST_CASE("iterator_twice_bug" * doctest::skip()) { robin_hood::unordered_flat_map map; auto a = 31U + 1024U * 0U; diff --git a/src/test/unit/unit_overflow.cpp b/src/test/unit/unit_overflow.cpp index 8986fc21..d66df320 100644 --- a/src/test/unit/unit_overflow.cpp +++ b/src/test/unit/unit_overflow.cpp @@ -10,9 +10,12 @@ TYPE_TO_STRING(robin_hood::unordered_flat_map>); TYPE_TO_STRING(robin_hood::unordered_node_map>); -#if ROBIN_HOOD(HAS_EXCEPTIONS) +// This test doesn't work as it was intended with the hash overflow protection +#if 0 -TEST_CASE_TEMPLATE("bug overflow" * doctest::test_suite("stochastic"), Map, +# if ROBIN_HOOD(HAS_EXCEPTIONS) + +TEST_CASE_TEMPLATE("bug_overflow" * doctest::test_suite("stochastic"), Map, robin_hood::unordered_flat_map>, robin_hood::unordered_node_map>) { Map rh; @@ -43,4 +46,6 @@ TEST_CASE_TEMPLATE("bug overflow" * doctest::test_suite("stochastic"), Map, REQUIRE(checksum::map(rh) == checksum::map(uo)); } -#endif +# endif + +#endif \ No newline at end of file diff --git a/src/test/unit/unit_playback.cpp b/src/test/unit/unit_playback.cpp new file mode 100644 index 00000000..ce370619 --- /dev/null +++ b/src/test/unit/unit_playback.cpp @@ -0,0 +1,51 @@ +#include +#include + +#include +#include +#include + +// Loads the file "test-data-new.txt" and processes it +TEST_CASE("playback" * doctest::skip()) { + auto sets = std::map>(); + + auto fin = std::fstream("../test-data-3.txt"); + auto line = std::string(); + + auto prefix = std::string(); + auto id = size_t(); + auto command = std::string(); + auto lineNr = 0; + try { + while (fin >> prefix >> id >> command) { + ++lineNr; + + // std::cout << prefix << " " << id << " " << command << " "; + if (command == "insert") { + auto number = int32_t(); + fin >> number; + // std::cout << number << std::endl; + sets[id].insert(number); + } else if (command == "construct") { + // std::cout << std::endl; + sets[id]; + } else if (command == "move_construct") { + auto idMoved = size_t(); + fin >> idMoved; + // std::cout << idMoved << std::endl; + sets[id] = std::move(sets[idMoved]); + } else if (command == "destroy") { + // std::cout << std::endl; + sets.erase(id); + } else if (command == "clear") { + // std::cout << std::endl; + sets[id].clear(); + } else { + throw std::runtime_error(command); + } + } + } catch (...) { + std::cout << "line " << lineNr << std::endl; + throw; + } +}