Implement MCS Combining lock #666

Merged 4 commits on Jun 28, 2024
Changes from all commits
42 changes: 23 additions & 19 deletions src/snmalloc/backend/globalconfig.h
@@ -96,33 +96,37 @@ namespace snmalloc
     // of allocators.
     SNMALLOC_SLOW_PATH static void ensure_init_slow()
     {
-      FlagLock lock{initialisation_lock};
+      if (initialised)
+        return;
+
+      with(initialisation_lock, [&]() {
 #ifdef SNMALLOC_TRACING
-      message<1024>("Run init_impl");
+        message<1024>("Run init_impl");
 #endif
 
-      if (initialised)
-        return;
+        if (initialised)
+          return;
 
-      LocalEntropy entropy;
-      entropy.init<Pal>();
-      // Initialise key for remote deallocation lists
-      RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());
+        LocalEntropy entropy;
+        entropy.init<Pal>();
+        // Initialise key for remote deallocation lists
+        RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());
 
-      // Need to randomise pagemap location. If requested and not a
-      // StrictProvenance architecture, randomize its table's location within a
-      // significantly larger address space allocation.
-      static constexpr bool pagemap_randomize =
-        mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
+        // Need to randomise pagemap location. If requested and not a
+        // StrictProvenance architecture, randomize its table's location within
+        // a significantly larger address space allocation.
+        static constexpr bool pagemap_randomize =
+          mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
 
-      Pagemap::concretePagemap.template init<pagemap_randomize>();
+        Pagemap::concretePagemap.template init<pagemap_randomize>();
 
-      if constexpr (aal_supports<StrictProvenance>)
-      {
-        Authmap::init();
-      }
+        if constexpr (aal_supports<StrictProvenance>)
+        {
+          Authmap::init();
+        }
 
-      initialised.store(true, std::memory_order_release);
+        initialised.store(true, std::memory_order_release);
+      });
     }
 
   public:
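For readers tracking the idiom change: the hunk above swaps an RAII guard (FlagLock lock{...}) for the with(lock, thunk) helper and adds a double-checked initialised test. The following is a minimal, self-contained sketch of the same shape, not snmalloc's implementation; SpinLock, init_lock, and do_init are illustrative stand-ins. One subtlety worth making explicit: a return inside the thunk exits only the lambda, so the inner early-out behaves like the early return of the original RAII version.

#include <atomic>
#include <utility>

// Illustrative with() helper: run a thunk while holding the lock.
template<typename Lock, typename F>
inline void with(Lock& lock, F&& f)
{
  lock.lock();
  std::forward<F>(f)();
  lock.unlock();
}

// Minimal test-and-set spin lock, standing in for snmalloc's locks.
struct SpinLock
{
  std::atomic_flag flag = ATOMIC_FLAG_INIT;

  void lock()
  {
    while (flag.test_and_set(std::memory_order_acquire))
      ;
  }

  void unlock()
  {
    flag.clear(std::memory_order_release);
  }
};

std::atomic<bool> initialised{false};
SpinLock init_lock;

void do_init() {} // hypothetical one-time setup

void ensure_init()
{
  // First check: skip the lock entirely once initialised.
  if (initialised.load(std::memory_order_acquire))
    return;

  with(init_lock, [&]() {
    // Second check: another thread may have initialised while we
    // waited for the lock. This return exits only the thunk.
    if (initialised.load(std::memory_order_acquire))
      return;

    do_init();
    initialised.store(true, std::memory_order_release);
  });
}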
14 changes: 9 additions & 5 deletions src/snmalloc/backend_helpers/lockrange.h
@@ -22,7 +22,7 @@ namespace snmalloc
      * This is infrequently used code, a spin lock simplifies the code
      * considerably, and should never be on the fast path.
      */
-    FlagWord spin_lock{};
+    CombiningLock spin_lock{};
 
   public:
     static constexpr bool Aligned = ParentRange::Aligned;
@@ -35,14 +35,18 @@
 
     CapPtr<void, ChunkBounds> alloc_range(size_t size)
    {
-      FlagLock lock(spin_lock);
-      return parent.alloc_range(size);
+      CapPtr<void, ChunkBounds> result;
+      with(spin_lock, [&]() {
+        {
+          result = parent.alloc_range(size);
+        }
+      });
+      return result;
     }
 
     void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
     {
-      FlagLock lock(spin_lock);
-      parent.dealloc_range(base, size);
+      with(spin_lock, [&]() { parent.dealloc_range(base, size); });
     }
   };
 };
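Since with() returns void, alloc_range now moves its result out of the critical section through a captured local. A sketch of that pattern, reusing the illustrative SpinLock and with() stand-ins from the sketch above (compute is likewise hypothetical):

int compute() { return 42; } // hypothetical guarded operation

int compute_locked(SpinLock& lock)
{
  int result{};
  // The thunk writes the captured local while holding the lock;
  // the caller reads it after the lock has been released.
  with(lock, [&]() { result = compute(); });
  return result;
}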
224 changes: 224 additions & 0 deletions src/snmalloc/ds/combininglock.h
@@ -0,0 +1,224 @@
#pragma once

#include "../aal/aal.h"
#include "../pal/pal.h"

#include <atomic>
#include <functional>

namespace snmalloc
{
class CombineLockNode;

struct CombiningLock
{
// Fast path lock in case there is no contention.
std::atomic<bool> flag{false};

// MCS queue of work items
std::atomic<CombineLockNode*> head{nullptr};
};

/**
 * @brief Combination of an MCS queue lock with Flat Combining.
 *
 * Each element in the queue has a pointer to a work item.
 * This means that, under contention, the thread holding the lock
 * can perform the work.
 *
 * As the work items are arbitrary lambdas, there are no simplifications
 * for combining related work items. E.g. the original Flat Combining
 * paper might sort a collection of inserts and perform them in a single
 * traversal.
 *
 * Note that we should perhaps add a Futex/WakeOnAddress mode to improve
 * performance in the contended case, rather than spinning.
 */

[Review comment · Contributor, on the note above] Could perhaps templatize the class on <bool notify = FALSE> or such?
class CombineLockNode
{
template<typename F>
friend class CombineLockNodeTempl;

enum class LockStatus
{
// The work for this node has not been completed.
WAITING,

// The work for this thread has been completed, and it is not the
// last element in the queue.
DONE,

// The work for this thread has not been completed, and it is the
// head of the queue.
READY
};

// Status of this node, set by the thread at the head of the queue
// when it either makes the thread for this node the head of the
// queue or completes its work.
std::atomic<LockStatus> status{LockStatus::WAITING};

// Used to store the queue
std::atomic<CombineLockNode*> next{nullptr};

// Stores a type-erased pointer to the work item (a C++ lambda)
// associated with this node in the queue.
void (*f_raw)(CombineLockNode*);

void release(CombiningLock& lock)
{
lock.flag.store(false, std::memory_order_release);
}

void set_status(LockStatus s)
{
status.store(s, std::memory_order_release);
}

constexpr CombineLockNode(void (*f)(CombineLockNode*)) : f_raw(f) {}

SNMALLOC_FAST_PATH void attach(CombiningLock& lock)
{
// Test if no one is waiting
if (lock.head.load(std::memory_order_relaxed) == nullptr)
[Review comment · Collaborator] Maybe it is beneficial to separate this region out as a fast path subject to being inlined. The remaining parts can be put into an attach_slow function.

[Reply · Member, author] I don't think this code is used here enough to warrant that, but maybe I should for future use.
{
// No one was waiting, so contention is low. Attempt to acquire the
// flag lock.
if (lock.flag.exchange(true, std::memory_order_acquire) == false)
{
// We grabbed the lock.
f_raw(this);

// Release the lock
release(lock);
return;
}
}
attach_slow(lock);
}

SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock)
{
// There is contention for the lock; we need to add our work to the
// queue of pending work.
auto prev = lock.head.exchange(this, std::memory_order_acq_rel);

if (prev != nullptr)
{
// If we aren't the head, link into predecessor
prev->next.store(this, std::memory_order_release);

// Wait for predecessor to complete
while (status.load(std::memory_order_relaxed) == LockStatus::WAITING)
Aal::pause();

// Determine if another thread completed our work.
if (status.load(std::memory_order_acquire) == LockStatus::DONE)
return;
}
else
{
// We are the head of the queue. Spin until we acquire the fast path
// lock. As we are in the queue, future requests shouldn't try to
// acquire the fast path lock, but stale views of the queue being
// empty could still be concurrent with this thread.
while (lock.flag.exchange(true, std::memory_order_acquire))
{
while (lock.flag.load(std::memory_order_relaxed))
{
Aal::pause();
}
}

// We could set
//   status = LockStatus::READY
// However, the subsequent code assumes it is READY, and
// nothing would read it.
}

// We are the head of the queue, and responsible for
// waking/performing our and subsequent work.
auto curr = this;
while (true)
{
// Perform work for head of the queue
curr->f_raw(curr);

// Determine if there are more elements.
auto n = curr->next.load(std::memory_order_acquire);
if (n != nullptr)
{
// Signal this work was completed and move on to
// next item.
curr->set_status(LockStatus::DONE);
curr = n;
continue;
}

// This could be the end of the queue; attempt to close the
// queue.
auto curr_c = curr;
if (lock.head.compare_exchange_strong(
curr_c,
nullptr,
std::memory_order_release,
std::memory_order_relaxed))
{
// Queue was successfully closed.
// Notify last element the work was completed.
curr->set_status(LockStatus::DONE);
release(lock);
return;
}

// Failed to close the queue; wait for the next thread to be
// added.
while (curr->next.load(std::memory_order_relaxed) == nullptr)
Aal::pause();

// As we had to wait, give the job to the next thread
// to carry on performing the work.
n = curr->next.load(std::memory_order_acquire);
n->set_status(LockStatus::READY);

// Notify the thread that we completed its work.
// Note that this needs to be done last, as we can't read
// curr->next after setting curr->status
curr->set_status(LockStatus::DONE);
return;
}
}
};

template<typename F>
class CombineLockNodeTempl : CombineLockNode
{
template<typename FF>
friend void with(CombiningLock&, FF&&);

// This holds the closure for the lambda
F f;

// Untyped version of calling f to store in the node.
static void invoke(CombineLockNode* self)
{
auto self_templ = reinterpret_cast<CombineLockNodeTempl*>(self);
self_templ->f();
}

CombineLockNodeTempl(CombiningLock& lock, F&& f_)
: CombineLockNode(invoke), f(f_)
{
attach(lock);
}
};

/**
* Lock primitive. This takes a reference to a Lock, and a thunk to
* call when the lock is available. The thunk should be independent of
* the current thread as the thunk may be executed by a different thread.
*/
template<typename F>
inline void with(CombiningLock& lock, F&& f)
{
CombineLockNodeTempl<F> node{lock, std::forward<F>(f)};
}
} // namespace snmalloc
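A small usage sketch for the new primitive, assuming snmalloc's include layout; the global counter, thread count, and iteration count are illustrative. As the comment on with() above says, the thunk may be executed by a different thread, so it must not depend on thread identity (e.g. thread-local state):

#include <snmalloc/ds/combininglock.h>

#include <thread>
#include <vector>

snmalloc::CombiningLock lock;
size_t counter = 0; // guarded by `lock`

int main()
{
  std::vector<std::thread> threads;
  for (int i = 0; i < 8; i++)
  {
    threads.emplace_back([]() {
      for (int j = 0; j < 100000; j++)
      {
        // Under contention, the thread at the head of the MCS queue may
        // run this thunk on our behalf.
        snmalloc::with(lock, []() { counter++; });
      }
    });
  }

  for (auto& t : threads)
    t.join();

  // The combining lock serialised all increments: counter == 800000.
  return counter == 800000 ? 0 : 1;
}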
1 change: 1 addition & 0 deletions src/snmalloc/ds/ds.h
@@ -6,6 +6,7 @@
 #include "../pal/pal.h"
 #include "aba.h"
 #include "allocconfig.h"
+#include "combininglock.h"
 #include "entropy.h"
 #include "flaglock.h"
 #include "mpmcstack.h"
7 changes: 7 additions & 0 deletions src/snmalloc/ds/flaglock.h
@@ -133,4 +133,11 @@ namespace snmalloc
       lock.flag.store(false, std::memory_order_release);
     }
   };
+
+  template<typename F>
+  inline void with(FlagWord& lock, F&& f)
+  {
+    FlagLock l(lock);
+    f();
+  }
 } // namespace snmalloc

[Review comment · Contributor, on the with() overload] Not when? ;)
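With this overload in place, FlagWord and the new CombiningLock expose the same with() shape, so call sites can be written generically over the lock type. A sketch under that assumption (the bump helper is illustrative, not part of snmalloc):

#include <snmalloc/ds/combininglock.h>
#include <snmalloc/ds/flaglock.h>

// Instantiates against either FlagWord or CombiningLock, since both now
// provide a matching with() overload in namespace snmalloc.
template<typename LockT>
void bump(LockT& lock, size_t& counter)
{
  snmalloc::with(lock, [&]() { counter++; });
}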
17 changes: 9 additions & 8 deletions src/snmalloc/ds/singleton.h
@@ -35,14 +35,15 @@ namespace snmalloc
 
       if (SNMALLOC_UNLIKELY(!initialised.load(std::memory_order_acquire)))
       {
-        FlagLock lock(flag);
-        if (!initialised)
-        {
-          init(&obj);
-          initialised.store(true, std::memory_order_release);
-          if (first != nullptr)
-            *first = true;
-        }
+        with(flag, [&]() {
+          if (!initialised)
+          {
+            init(&obj);
+            initialised.store(true, std::memory_order_release);
+            if (first != nullptr)
+              *first = true;
+          }
+        });
       }
       return obj;
     }