[yugabyte#26818] DocDB: Pull RWQueue from libcds and use it in rpc::ThreadPool

spolitov · spolitov · commit 09769f9b39d0 · 2025-04-23T07:51:02.000+03:00
Summary: **libcds** implements lock-free queues in C++ (which lacks garbage collection) using hazard pointers. However, using its lock-free containers requires attaching each thread to the libcds manager, adding unnecessary friction—especially when working with `std::async`. **libcds** also provides **RWQueue**, a non-lock-free but highly optimized concurrent queue. In benchmarks like `LockfreeTest.QueuePerformance`, it often outperforms lock-free alternatives. Pulled RWQueue's implementation into our codebase. Also introduced `TrivialSpinlock`, which outperforms `simple_spinlock` when used with RWQueue. The results for RpcStubTest.TestRpcPerformance with this diff on n2-standard-64: ``` Total: 258.347ms, calls per second: 193538 (5.000us per call, NOT latency), slow calls: 0% Total: 277.650ms, calls per second: 180082 (5.000us per call, NOT latency), slow calls: 0% Total: 344.553ms, calls per second: 145115 (6.000us per call, NOT latency), slow calls: 0% Total: 289.964ms, calls per second: 172435 (5.000us per call, NOT latency), slow calls: 0% ``` master: ``` Total: 273.951ms, calls per second: 182514 (5.000us per call, NOT latency), slow calls: 0% Total: 341.373ms, calls per second: 146467 (6.000us per call, NOT latency), slow calls: 0% Total: 247.243ms, calls per second: 202230 (4.000us per call, NOT latency), slow calls: 0% Total: 344.846ms, calls per second: 144992 (6.000us per call, NOT latency), slow calls: 0% ``` Jira: DB-16208 Test Plan: Jenkins Reviewers: hsunder Reviewed By: hsunder Subscribers: esheng, ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D43194
diff --git a/src/yb/rpc/thread_pool.cc b/src/yb/rpc/thread_pool.cc
@@ -21,9 +21,7 @@
 
 #include <boost/intrusive/list.hpp>
 
-#include <cds/container/basket_queue.h>
-#include <cds/gc/dhp.h>
-
+#include "yb/util/concurrent_queue.h"
 #include "yb/util/flags.h"
 #include "yb/util/lockfree.h"
 #include "yb/util/scope_exit.h"
@@ -43,7 +41,7 @@ namespace {
 
 class Worker;
 
-using TaskQueue = cds::container::BasketQueue<cds::gc::DHP, ThreadPoolTask*>;
+using TaskQueue = RWQueue<ThreadPoolTask*>;
 using WaitingWorkers = LockFreeStack<Worker>;
 
 struct ThreadPoolShare {
diff --git a/src/yb/util/concurrent_queue.h b/src/yb/util/concurrent_queue.h
@@ -0,0 +1,134 @@
+// Copyright (c) YugabyteDB, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+// in compliance with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied.  See the License for the specific language governing permissions and limitations
+// under the License.
+//
+
+#pragma once
+
+#include "yb/util/locks.h"
+
+namespace yb {
+
+// RWQueue implementation from [1998] Maged Michael, Michael Scott
+// "Simple, fast, and practical non-blocking and blocking concurrent queue algorithms"
+template <typename T>
+class RWQueue {
+ public:
+  using value_type = T;
+
+  RWQueue() {
+    head_.node = tail_.node = new Node;
+  }
+
+  ~RWQueue() {
+    clear();
+    DCHECK_EQ(head_.node, tail_.node);
+    delete head_.node;
+  }
+
+  template <class... Args>
+  void Push(Args&&... value) {
+    auto node = new Node(std::forward<Args>(value)...);
+    std::lock_guard lock(tail_.mutex);
+    tail_.node->next.store(node, std::memory_order_release);
+    tail_.node = node;
+  }
+
+  template <class... Args>
+  bool push(Args&&... value) {
+    Push(std::forward<Args>(value)...);
+    return true;
+  }
+
+  bool Pop(value_type& value) {
+    return DoPop(value);
+  }
+
+  bool pop(value_type& value) {
+    return Pop(value);
+  }
+
+  std::optional<value_type> Pop() {
+    std::optional<value_type> result;
+    DoPop(result);
+    return result;
+  }
+
+  std::optional<value_type> pop() {
+    return Pop();
+  }
+
+  void Clear() {
+    Node* head;
+    Node* tail;
+    {
+      std::lock_guard lock_head(head_.mutex);
+      std::lock_guard lock_tail(tail_.mutex);
+      head = head_.node;
+      tail = tail_.node;
+      head_.node = tail;
+    }
+    while (head != tail) {
+      auto* next = head->next.load(std::memory_order_relaxed);
+      if (!next) {
+        break;
+      }
+      delete head;
+      head = next;
+    }
+  }
+
+  void clear() {
+    Clear();
+  }
+
+  bool empty() const {
+    std::lock_guard lock(head_.mutex);
+    return head_.node->next.load(std::memory_order_relaxed) == nullptr;
+  }
+
+ private:
+  template <class Out>
+  bool DoPop(Out& value) {
+    Node* node;
+    {
+      std::lock_guard lock(head_.mutex);
+      node = head_.node;
+      auto new_head = node->next.load(std::memory_order_acquire);
+      if (!new_head) {
+        return false;
+      }
+      value = std::move(new_head->value);
+      head_.node = new_head;
+    }
+    delete node;
+    return true;
+  }
+
+  struct Node {
+    std::atomic<Node*> next{nullptr};
+    value_type value;
+
+    template <typename... Args>
+    explicit Node(Args&&... args)
+        : value(std::forward<Args>(args)...) {}
+  };
+
+  struct EndType {
+    mutable TrivialSpinlock mutex;
+    Node* node;
+  };
+
+  alignas(CACHELINE_SIZE) EndType head_;
+  alignas(CACHELINE_SIZE) EndType tail_;
+};
+
+}  // namespace yb
diff --git a/src/yb/util/lockfree-test.cc b/src/yb/util/lockfree-test.cc
@@ -14,6 +14,7 @@
 //
 
 #include <atomic>
+#include <regex>
 #include <string>
 #include <thread>
 
@@ -27,6 +28,8 @@
 #include <cds/gc/dhp.h>
 #include <gtest/gtest.h>
 
+#include "yb/util/concurrent_queue.h"
+#include "yb/util/flags.h"
 #include "yb/util/lockfree.h"
 #include "yb/util/logging.h"
 #include "yb/util/monotime.h"
@@ -35,6 +38,9 @@
 #include "yb/util/thread.h"
 #include "yb/util/tsan_util.h"
 
+DEFINE_test_flag(string, queue_name_regex, "",
+    "Regex to filter queue by name in LockfreeTest.QueuePerformance test");
+
 using namespace std::literals;
 
 namespace yb {
@@ -44,13 +50,14 @@ struct TestEntry : public MPSCQueueEntry<TestEntry> {
   size_t index;
 };
 
-TEST(LockfreeTest, MPSCQueueSimple) {
+template<class Queue, class NoneValue>
+void TestQueueSimple(const NoneValue& none_value) {
   const size_t kTotalEntries = 10;
   std::vector<TestEntry> entries(kTotalEntries);
   for (size_t i = 0; i != entries.size(); ++i) {
     entries[i].index = i;
   }
-  MPSCQueue<TestEntry> queue;
+  Queue queue;
 
   // Push pop 1 entry
   queue.Push(&entries[0]);
@@ -65,6 +72,13 @@ TEST(LockfreeTest, MPSCQueueSimple) {
     ASSERT_EQ(&entry, queue.Pop());
   }
 
+  for (auto& entry : entries) {
+    queue.Push(&entry);
+  }
+
+  queue.Clear();
+  ASSERT_EQ(none_value, queue.Pop());
+
   // Mixed push and pop
   queue.Push(&entries[0]);
   queue.Push(&entries[1]);
@@ -82,12 +96,20 @@ TEST(LockfreeTest, MPSCQueueSimple) {
   ASSERT_EQ(&entries[5], queue.Pop());
   ASSERT_EQ(&entries[6], queue.Pop());
   ASSERT_EQ(&entries[7], queue.Pop());
-  ASSERT_EQ(nullptr, queue.Pop());
+  ASSERT_EQ(none_value, queue.Pop());
   queue.Push(&entries[8]);
   queue.Push(&entries[9]);
   ASSERT_EQ(&entries[8], queue.Pop());
   ASSERT_EQ(&entries[9], queue.Pop());
-  ASSERT_EQ(nullptr, queue.Pop());
+  ASSERT_EQ(none_value, queue.Pop());
+}
+
+TEST(LockfreeTest, MPSCQueueSimple) {
+  TestQueueSimple<MPSCQueue<TestEntry>>(nullptr);
+}
+
+TEST(LockfreeTest, RWQueueSimple) {
+  TestQueueSimple<RWQueue<TestEntry*>>(std::nullopt);
 }
 
 TEST(LockfreeTest, MPSCQueueConcurrent) {
@@ -281,6 +303,7 @@ class QueuePerformanceHelper {
         cds::container::optimistic_queue::make_traits<OptAllocator>::type>>(
             "OptimisticQueue/BlockAllocator/DHP");
     TestQueue<cds::container::RWQueue<ptrdiff_t>>("RWQueue");
+    TestQueue<RWQueue<ptrdiff_t>>("YBRWQueue");
     // On GCC11, segmented queue seems to call sized delete with a different size than it allocates
     // with, which causes a segfault in tcmalloc.
     // See issue https://github.com/khizmax/libcds/issues/181.
@@ -368,7 +391,7 @@ class QueuePerformanceHelper {
     start_latch.Wait();
     auto start = MonoTime::Now();
 
-    bool wait_result = finish_latch.WaitUntil(start + 10s);
+    bool wait_result = finish_latch.WaitUntil(start + 30s);
     auto stop = MonoTime::Now();
     auto passed = stop - start;
 
@@ -395,14 +418,14 @@ class QueuePerformanceHelper {
     }
   }
 
-  template <class T>
-  void TestQueue(const std::string& name) {
-    T queue;
-    DoTestQueue(name, &queue);
-  }
-
   template <class T, class... Args>
   void TestQueue(const std::string& name, Args&&... args) {
+    if (!name.empty() && !FLAGS_TEST_queue_name_regex.empty()) {
+      std::regex regex(FLAGS_TEST_queue_name_regex, std::regex::egrep);
+      if (!regex_match(name, regex)) {
+        return;
+      }
+    }
     T queue(std::forward<Args>(args)...);
     DoTestQueue(name, &queue);
   }
diff --git a/src/yb/util/lockfree.h b/src/yb/util/lockfree.h
@@ -21,6 +21,7 @@
 
 #include "yb/gutil/dynamic_annotations.h"
 #include "yb/gutil/macros.h"
+
 #include "yb/util/atomic.h" // For IsAcceptableAtomicImpl
 
 namespace yb {
@@ -112,6 +113,11 @@ class MPSCQueue {
     return push_head_.load(std::memory_order_acquire) == nullptr;
   }
 
+  void Clear() {
+    pop_head_ = nullptr;
+    push_head_.store(nullptr, std::memory_order_release);
+  }
+
   void Drain() {
     while (auto* entry = Pop()) {
       delete entry;
diff --git a/src/yb/util/locks.h b/src/yb/util/locks.h
@@ -329,4 +329,51 @@ class SemaphoreLock {
   Semaphore& semaphore_;
 };
 
+// simple_spinlock is something intermediate between a spinlock and a mutex, because it can fall
+// back to waiting on a futex. It also collects stats, etc.
+// TrivialSpinlock does not have such overhead and is more performant. For instance, when testing
+// RWQueue in conjunction with it, the execution time was 1.5 times lower.
+class CAPABILITY("mutex") TrivialSpinlock {
+ public:
+  TrivialSpinlock() = default;
+
+  ~TrivialSpinlock() {
+    DCHECK(!is_locked());
+  }
+
+  void lock() ACQUIRE() {
+    size_t lock_counter = 16;
+    while (!try_lock()) {
+      while (is_locked()) {
+        // Max of 32752 pauses before we fall back to yield.
+        if (lock_counter <= 16 * 1024) {
+          for (size_t n = 0; n < lock_counter; ++n) {
+            base::subtle::PauseCPU();
+          }
+          lock_counter *= 2;
+        } else {
+          std::this_thread::yield();
+        }
+      }
+    }
+  }
+
+  void unlock() RELEASE() {
+    lockword_.store(false, std::memory_order_release);
+  }
+
+  bool try_lock() TRY_ACQUIRE(true) {
+    return !lockword_.exchange(true, std::memory_order_acquire);
+  }
+
+  bool is_locked() {
+    return lockword_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  std::atomic<bool> lockword_{false};
+
+  DISALLOW_COPY_AND_ASSIGN(TrivialSpinlock);
+};
+
 } // namespace yb