// Distributed under the MIT License.
// See LICENSE.txt for details.

#pragma once

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <new>  // for hardware_destructive_interference_size
#include <optional>
#include <ostream>
#include <type_traits>
#include <vector>

#include "Utilities/ErrorHandling/Error.hpp"
#include "Utilities/ForceInline.hpp"

namespace Parallel {
/*!
 * \brief A lockfree multi-producer multi-consumer unordered set.
 *
 * An unordered set where keys of type `T` are stored directly (i.e. not
 * hashed). As a result, there is the constraint that
 * `sizeof(T) <= sizeof(std::uint64_t)`, i.e. the key type `T` must be 8 bytes
 * or less.
 *
 * In order to provide fully lockfree semantics, a capacity must be chosen at
 * construction. If the capacity is reached, no more keys can be inserted. The
 * capacity must be a positive power of 2, i.e. `2^N` for `N >= 0`, which
 * allows slot indices to be computed with a cheap bitwise AND instead of a
 * modulo.
 *
 * Ideally the values of `T` are reasonably well distributed so as to avoid
 * collisions, but collisions are supported by linear probing. I.e., if two
 * keys would point to the same internal location, the location for the second
 * inserted key is obtained by linearly searching for the next empty slot. In
 * practice this means that if the load factor exceeds about 50%, performance
 * degradation should be expected. This can be resolved by creating an
 * unordered set with a larger maximum capacity. Linear probing makes
 * operations O(1) in the best case and O(2^N) (i.e. the capacity of the
 * unordered set) in the worst case, as illustrated below.
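 *
 * For example (an illustrative sketch, not taken from the original
 * documentation; `key` and `capacity` are placeholders), with a capacity of 8
 * the slot index of a key is
 * \code
 * // Both 3 and 11 map to slot 3 (11 & 7 == 3), so whichever of the two is
 * // inserted second is placed in the next empty slot by linear probing.
 * const std::uint64_t index = key bitand (capacity - 1);
 * \endcode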
 *
 * Users can optionally specify the sentinel value used to mark a slot as
 * empty by specifying the `EmptySlotValue`. By default the
 * `EmptySlotValue` is 0. It is undefined behavior if a key with the
 * value of the `EmptySlotValue` is inserted. No diagnostic is provided.
 *
 * \warning This class does not synchronize memory, which means that while all
 * operations on the container are atomic, it cannot be used to synchronize
 * data not contained in the set across different cores.
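 *
 * \par Example
 * A minimal usage sketch (illustrative, not taken from the original
 * documentation; the key values and capacity are arbitrary):
 * \code
 * // Capacity must be a positive power of two.
 * Parallel::LockfreeUnorderedSet<std::uint64_t> set(1024);
 * if (set.insert(42)) {
 *   // The key is now (or already was) in the set.
 * }
 * if (set.contains(42)) {
 *   // Note: another thread may erase the key immediately after this check.
 * }
 * if (set.erase(42)) {
 *   // This thread performed the erase.
 * }
 * \endcode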
 */
template <class T, std::uint64_t EmptySlotValue = 0,
          bool ForceCachelineAlignment = true>
class LockfreeUnorderedSet {
 private:
#ifdef __cpp_lib_hardware_interference_size
  static constexpr size_t cache_line_size_ =
      std::hardware_destructive_interference_size;
#else
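  // Fall back to 64 bytes, the cache line size on most current x86-64 cores
  // and many ARM cores.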
  static constexpr size_t cache_line_size_ = 64;
#endif

 public:
  static_assert(sizeof(T) <= sizeof(std::uint64_t));

  /*!
   * \brief Create a multi-producer multi-consumer unordered set that allows
   * at most `capacity` objects.
   *
   * \warning \p capacity must be a power of two greater than 0.
   */
  explicit LockfreeUnorderedSet(size_t capacity);
  // Delete copy and move constructors and assignment operators since this
  // class stores atomic variables needed for thread-safety.
  LockfreeUnorderedSet(const LockfreeUnorderedSet&) = delete;
  LockfreeUnorderedSet& operator=(const LockfreeUnorderedSet&) = delete;
  LockfreeUnorderedSet(LockfreeUnorderedSet&&) = delete;
  LockfreeUnorderedSet& operator=(LockfreeUnorderedSet&&) = delete;
  ~LockfreeUnorderedSet() = default;

  /*!
   * \brief Insert the \p key into the set.
   *
   * \param key The key to insert.
   * \param max_linear_probes The maximum number of linear probes to perform
   * before we give up. If `std::nullopt` (the default), then the maximum
   * number of linear probes is the container capacity (`capacity`).
   * This can be used to reduce the worst case linear probing cost from
   * O(capacity) to O(\p max_linear_probes) but users should be certain
   * this will not introduce bugs in their code.
   * \return `true` if the key was inserted or found in the set. Returns
   * `false` if we failed to insert the key because we reached
   * \p max_linear_probes.
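   *
   * For example (an illustrative sketch, not from the original documentation;
   * `set` and `key` are placeholders):
   * \code
   * // Probe at most 16 slots instead of the full capacity before giving up.
   * const bool inserted = set.insert(key, 16);
   * \endcode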
   */
  [[nodiscard]] bool insert(
      T key, std::optional<size_t> max_linear_probes = std::nullopt) noexcept;

  /*!
   * \brief Erase the \p key from the set.
   *
   * \param key The key to erase.
   * \param max_linear_probes The maximum number of linear probes to perform
   * before we give up. If `std::nullopt` (the default), then the maximum
   * number of linear probes is the container capacity (`capacity`).
   * This can be used to reduce the worst case linear probing cost from
   * O(capacity) to O(\p max_linear_probes) but users should be certain
   * this will not introduce bugs in their code.
   * \return `true` if the key was erased by this thread. Returns `false` if we
   * failed to erase the key because we reached \p max_linear_probes
   * or because another thread erased the key. We have no general way of
   * knowing why we could not find the \p key to erase.
   */
  [[nodiscard]] bool erase(
      T key, std::optional<size_t> max_linear_probes = std::nullopt) noexcept;

  /*!
   * \brief Check if the unordered set contains the \p key.
   *
   * \param key The key to check if it is contained in the unordered set.
   * \param max_linear_probes The maximum number of linear probes to perform
   * before we give up. If `std::nullopt` (the default), then the maximum
   * number of linear probes is the container capacity (`capacity`).
   * This can be used to reduce the worst case linear probing cost from
   * O(capacity) to O(\p max_linear_probes) but users should be certain
   * this will not introduce bugs in their code.
   * \return `true` if the \p key is found, `false` if not.
   *
   * \warning A \p key may be found in the unordered set and then immediately
   * erased by another thread.
   */
  [[nodiscard]] bool contains(T key, std::optional<size_t> max_linear_probes =
                                         std::nullopt) const noexcept;

  /// \brief Returns the capacity.
  constexpr size_t capacity() const noexcept { return entries_.size(); }

 private:
  [[nodiscard]] constexpr SPECTRE_ALWAYS_INLINE std::uint64_t
  compute_internal_key(const T key) const {
    return static_cast<std::uint64_t>(
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
        *reinterpret_cast<const std::conditional_t<
            sizeof(T) == 8, std::uint64_t,
            std::conditional_t<sizeof(T) == 4, std::uint32_t,
                               std::conditional_t<sizeof(T) == 2, std::uint16_t,
                                                  std::uint8_t>>>*>(&key));
  }
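
  // Note: a strictly well-defined alternative (an illustrative sketch, not
  // used by this class) would copy the key's object representation instead of
  // reinterpreting the pointer:
  //
  //   std::uint64_t internal_key = 0;
  //   std::memcpy(&internal_key, &key, sizeof(T));  // requires <cstring>
  //
  // On little-endian platforms this yields the same value as the cast above.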

  // Wrap the retrieval to easily support both cacheline-aligned and unaligned
  // data. We force inline, noexcept, and constexpr these functions to try to
  // ensure zero runtime overhead.
  [[nodiscard]] constexpr SPECTRE_ALWAYS_INLINE std::atomic<std::uint64_t>& get(
      const std::uint64_t index) noexcept {
    if constexpr (ForceCachelineAlignment) {
      return entries_[index].value;
    } else {
      return entries_[index];
    }
  }
  [[nodiscard]] constexpr SPECTRE_ALWAYS_INLINE const
      std::atomic<std::uint64_t>&
      get(const std::uint64_t index) const noexcept {
    if constexpr (ForceCachelineAlignment) {
      return entries_[index].value;
    } else {
      return entries_[index];
    }
  }

  // Since atomics may not always be lock free depending on alignment, we want
  // to catch issues where the hardware cannot guarantee that the atomic is
  // lock free. It is not inherently bad that we cannot guarantee that the
  // atomics are lock free at compile time, since we could check at runtime,
  // but it's nice to have the guarantee when possible. If for some reason the
  // guarantee isn't given, then likely forcing alignment of the individual
  // atomic variables would restore it. For example, some systems may only be
  // able to handle atomics on 32-byte word boundaries.
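  // A runtime check (an illustrative sketch of the alternative mentioned
  // above, not used here) could look like:
  //
  //   std::atomic<std::uint64_t> probe{};
  //   if (not probe.is_lock_free()) {
  //     ERROR("std::atomic<std::uint64_t> is not lock free on this system.");
  //   }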
  static_assert(std::atomic<std::uint64_t>::is_always_lock_free);

  struct alignas(cache_line_size_) AlignedEntry {
    std::atomic<std::uint64_t> value{0};
  };

  alignas(cache_line_size_)
      std::vector<std::conditional_t<ForceCachelineAlignment, AlignedEntry,
                                     std::atomic<std::uint64_t>>> entries_{};
  // Ensure we pad the end to avoid false sharing. Unlikely to happen because
  // of the layout, but better to be safe.
  char padding_[cache_line_size_] = {};  // NOLINT(modernize-avoid-c-arrays)
};

template <class T, std::uint64_t EmptySlotValue, bool ForceCachelineAlignment>
LockfreeUnorderedSet<T, EmptySlotValue, ForceCachelineAlignment>::
    LockfreeUnorderedSet(const size_t capacity)
    : entries_(capacity) {
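  // A positive power of two has exactly one bit set, so subtracting one flips
  // all of the lower bits and `capacity bitand (capacity - 1)` is zero. For
  // example, 8 bitand 7 == 0b1000 bitand 0b0111 == 0, while
  // 6 bitand 5 == 0b110 bitand 0b101 == 0b100 != 0.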
  if (capacity == 0 or (capacity bitand (capacity - 1)) != 0) {
    ERROR("The capacity must be a power of two larger than 0. Got "
          << capacity);
  }
  // Explicitly initialize every slot to the empty-slot sentinel.
  for (auto& t : entries_) {
    if constexpr (ForceCachelineAlignment) {
      t.value.store(EmptySlotValue, std::memory_order_relaxed);
    } else {
      t.store(EmptySlotValue, std::memory_order_relaxed);
    }
  }
}

template <class T, std::uint64_t EmptySlotValue, bool ForceCachelineAlignment>
bool LockfreeUnorderedSet<T, EmptySlotValue, ForceCachelineAlignment>::insert(
    const T key, const std::optional<size_t> max_linear_probes) noexcept {
  const size_t counter_end = max_linear_probes.value_or(entries_.size());

  const std::uint64_t internal_key = compute_internal_key(key);
  for (std::uint64_t index = internal_key, counter = 0; counter < counter_end;
       (void)++index, (void)++counter) {  // loop for linear probing
    index = index bitand (entries_.size() - 1);
    const std::uint64_t probed_internal_key =
        get(index).load(std::memory_order_relaxed);
    if (probed_internal_key == internal_key) {
      return true;  // set already contains value
    } else {
      if (probed_internal_key != EmptySlotValue) {
        // The entry is used by another key.
        continue;
      }
      // The entry is empty. Let's try to set it.
      std::uint64_t current_key_in_slot = EmptySlotValue;
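      // If the CAS fails because another thread concurrently inserted this
      // same key, the key is nevertheless in the set, so we still report
      // success.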
      if (not get(index).compare_exchange_strong(
              current_key_in_slot, internal_key, std::memory_order_relaxed,
              std::memory_order_relaxed) and
          current_key_in_slot != internal_key) {
        // Another thread just stole this slot from us. Try next slot.
        continue;
      }
      // Successful insert. Return.
      return true;
    }
  }
  return false;
}

template <class T, std::uint64_t EmptySlotValue, bool ForceCachelineAlignment>
bool LockfreeUnorderedSet<T, EmptySlotValue, ForceCachelineAlignment>::erase(
    const T key, const std::optional<size_t> max_linear_probes) noexcept {
  const size_t counter_end = max_linear_probes.value_or(entries_.size());

  const std::uint64_t internal_key = compute_internal_key(key);
  for (std::uint64_t index = internal_key, counter = 0; counter < counter_end;
       (void)++index, (void)++counter) {  // loop for linear probing
    index = index bitand (entries_.size() - 1);
    const std::uint64_t probed_internal_key =
        get(index).load(std::memory_order_relaxed);
    if (probed_internal_key == internal_key) {
      std::uint64_t current_key_in_slot = internal_key;
      if (not get(index).compare_exchange_strong(
              current_key_in_slot, EmptySlotValue, std::memory_order_relaxed,
              std::memory_order_relaxed) and
          current_key_in_slot != EmptySlotValue) {
        // If the CAS failed and the slot value is not the EmptySlotValue,
        // then we failed to erase. E.g. another thread could have erased this
        // value and a different key could then have been inserted into the
        // slot.
        // NOLINTNEXTLINE(readability-simplify-boolean-expr)
        return false;
      }
      return true;
    }
  }
  return false;
}

template <class T, std::uint64_t EmptySlotValue, bool ForceCachelineAlignment>
bool LockfreeUnorderedSet<T, EmptySlotValue, ForceCachelineAlignment>::contains(
    const T key, const std::optional<size_t> max_linear_probes) const noexcept {
  const size_t counter_end = max_linear_probes.value_or(entries_.size());

  const std::uint64_t internal_key = compute_internal_key(key);
  for (std::uint64_t index = internal_key, counter = 0; counter < counter_end;
       (void)++index, (void)++counter) {  // loop for linear probing
    index = index bitand (entries_.size() - 1);
    const std::uint64_t probed_internal_key =
        get(index).load(std::memory_order_relaxed);
    if (probed_internal_key == internal_key) {
      return true;
    }
  }
  return false;
}
}  // namespace Parallel