
Commit 9a4b400

[yugabyte#4395] Prepared/replicated operation state ordering fix with a perf workaround for 64-core nodes
Summary:
Commit f9f906a ("[yugabyte#4395] Fix SnapshotTxnTest.MultiWriteWithRestart flakiness") fixed an issue where a replicated operation could still be unprepared while we were treating it as prepared. However, that fix caused a regression in a heavy write workload (MultiTableMultiIndexInserts in yb-sample-apps) on a 3-node cluster of 64-core i3.16xlarge nodes, so it was reverted by commit 1158e5f ("[yugabyte#4835] Revert operation driver logic changes causing a perf regression").

After an investigation it became clear that submitting an empty task to the preparer's thread pool avoids the performance regression, so this diff restores the original fix with that small addition.

Why does submitting an empty task improve performance? The ThreadPool takes a mutex when a task is submitted, and many threads submit tasks to this pool, so waiting on that mutex can take a significant amount of time. Submitting an empty task therefore acts as a small delay. Because it happens while UpdateConsensus is being processed, it increases UpdateConsensus processing time on the follower. As a result, the leader accumulates a larger UpdateConsensus request for the next call, which results in fewer UpdateConsensus calls overall, i.e. better batching.

Submitting an empty task to the thread pool is acceptable only as a temporary workaround for the performance issue caused by the correctness fix. However, both the prepared/replicated state ordering fix and the performance of the MultiTableMultiIndexInserts workload on 64-core nodes are important, so this diff implements the best solution available to us now. A better long-term solution would be a controlled UpdateConsensus delay that improves throughput.

Test Plan: ybd tsan --gtest_filter SnapshotTxnTest.MultiWriteWithRestart -n 500 -- -p 2

Reviewers: mikhail

Reviewed By: mikhail

Subscribers: bogdan, ybase

Differential Revision: https://phabricator.dev.yugabyte.com/D8719
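
To make the mutex-contention explanation above concrete, here is a minimal sketch of the submission path only. It is not the actual yb::ThreadPool API: TinyThreadPool and SubmitEmptyTask are hypothetical names, worker threads are omitted, and the point is only that even an empty task pays the cost of acquiring the contended submission mutex.

// Hypothetical sketch (not the YugabyteDB ThreadPool): only the submission
// path is modeled, because that is where the useful delay comes from.
#include <deque>
#include <functional>
#include <mutex>

class TinyThreadPool {
 public:
  // Every submission takes the queue mutex. Under a heavy write workload many
  // threads contend here, so even submitting an empty task costs a short wait.
  void Submit(std::function<void()> task) {
    std::lock_guard<std::mutex> lock(mutex_);
    queue_.push_back(std::move(task));
  }

 private:
  std::mutex mutex_;
  std::deque<std::function<void()>> queue_;  // Worker threads omitted in this sketch.
};

// Called while UpdateConsensus is being handled on a follower (in this sketch):
// the wait on the mutex slightly lengthens UpdateConsensus processing, so the
// leader batches more operations into its next UpdateConsensus call.
inline void SubmitEmptyTask(TinyThreadPool* pool) {
  pool->Submit([] {});  // Empty body: only the submission cost matters.
}
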
1 parent baa37e3 commit 9a4b400

3 files changed: +59, -60 lines

src/yb/tablet/operations/operation_driver.cc

Lines changed: 29 additions & 37 deletions
@@ -256,42 +256,15 @@ Status OperationDriver::PrepareAndStart() {
     // We can only do this after we've called Start()
     prepare_state_ = PREPARED;
 
-    // On the replica (non-leader) side, the replication state might have been REPLICATING during
-    // our previous acquisition of this lock, but it might have changed to REPLICATED in the
-    // meantime. That would mean ReplicationFinished got called, but ReplicationFinished would not
-    // trigger Apply unless the operation is PREPARED, so we are responsible for doing that.
-    // If we fail to capture the new replication state here, the operation will never be applied.
-    repl_state_copy = replication_state_;
+    if (replication_state_ == NOT_REPLICATING) {
+      replication_state_ = REPLICATING;
+    }
   }
 
-  switch (repl_state_copy) {
-    case NOT_REPLICATING:
-    {
-      {
-        std::lock_guard<simple_spinlock> lock(lock_);
-        replication_state_ = REPLICATING;
-      }
+  return Status::OK();
+}
 
-      // After the batching changes from 07/2017, It is the caller's responsibility to call
-      // Consensus::Replicate. See Preparer for details.
-      return Status::OK();
-    }
-    case REPLICATING:
-    {
-      // Already replicating - nothing to trigger
-      return Status::OK();
-    }
-    case REPLICATION_FAILED:
-      DCHECK(!operation_status_.ok());
-      FALLTHROUGH_INTENDED;
-    case REPLICATED:
-    {
-      // We can move on to apply. Note that ApplyOperation() will handle the error status in the
-      // REPLICATION_FAILED case.
-      return ApplyOperation(yb::OpId::kUnknownTerm, nullptr /* applied_op_ids */);
-    }
-  }
-  FATAL_INVALID_ENUM_VALUE(ReplicationState, repl_state_copy);
+OperationDriver::~OperationDriver() {
 }
 
 void OperationDriver::ReplicationFailed(const Status& replication_status) {

@@ -373,11 +346,30 @@ void OperationDriver::ReplicationFinished(
   // Note that if we set the state to REPLICATION_FAILED above, ApplyOperation() will actually abort
   // the operation, i.e. ApplyTask() will never be called and the operation will never be applied to
   // the tablet.
-  if (prepare_state_copy == PREPARED) {
-    // We likely need to do cleanup if this fails so for now just
-    // CHECK_OK
-    CHECK_OK(ApplyOperation(leader_term, applied_op_ids));
+  if (prepare_state_copy != PrepareState::PREPARED) {
+    LOG(DFATAL) << "Replicating an operation that has not been prepared: " << AsString(this);
+
+    LOG(ERROR) << "Attempting to wait for the operation to be prepared";
+
+    // This case should never happen, but if it happens we are trying to survive.
+    for (;;) {
+      std::this_thread::sleep_for(1ms);
+      PrepareState prepare_state;
+      {
+        std::lock_guard<simple_spinlock> lock(lock_);
+        prepare_state = prepare_state_;
+        if (prepare_state == PrepareState::PREPARED) {
+          break;
+        }
+      }
+      YB_LOG_EVERY_N_SECS(WARNING, 1)
+          << "Waiting for the operation to be prepared, current state: " << prepare_state;
+    }
   }
+
+  // We likely need to do cleanup if this fails so for now just
+  // CHECK_OK
+  CHECK_OK(ApplyOperation(leader_term, applied_op_ids));
 }
 
 void OperationDriver::Abort(const Status& status) {
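
The net effect of the two hunks above: PrepareAndStart() only records that replication may already be in progress, and ReplicationFinished() refuses to apply an operation that is not yet PREPARED, waiting for it instead. The toy model below illustrates that ordering invariant under an assumed simplification: ToyDriver and its methods are hypothetical, and it uses a condition variable where the real OperationDriver polls its state under a simple_spinlock.

// Toy illustration of the invariant restored by this change: Apply runs only
// after the operation is both PREPARED and replicated, even if replication
// finishes first. Hypothetical names; not the actual OperationDriver.
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class ToyDriver {
 public:
  void MarkPrepared() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      prepared_ = true;
    }
    cv_.notify_all();
  }

  // Mirrors the new ReplicationFinished() behavior: if replication wins the
  // race, wait for PREPARED instead of applying an unprepared operation.
  void MarkReplicatedAndApply() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return prepared_; });
    std::cout << "apply: operation is prepared and replicated" << std::endl;
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  bool prepared_ = false;
};

int main() {
  ToyDriver driver;
  std::thread replication([&driver] { driver.MarkReplicatedAndApply(); });
  std::this_thread::sleep_for(std::chrono::milliseconds(5));  // Replication "finishes" first.
  driver.MarkPrepared();
  replication.join();
  return 0;
}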

src/yb/tablet/operations/operation_driver.h

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ class OperationDriver : public RefCountedThreadSafe<OperationDriver>,
     PREPARED
   };
 
-  ~OperationDriver() override {}
+  ~OperationDriver();
 
   // Starts operation, returns false is we should NOT continue processing the operation.
   bool StartOperation();

src/yb/tablet/preparer.cc

Lines changed: 29 additions & 22 deletions
@@ -31,6 +31,7 @@
 DEFINE_int32(max_group_replicate_batch_size, 16,
              "Maximum number of operations to submit to consensus for replication in a batch.");
 
+using namespace std::literals;
 using std::vector;
 
 namespace yb {

@@ -139,8 +140,19 @@ Status PreparerImpl::Submit(OperationDriver* operation_driver) {
     return STATUS(IllegalState, "Tablet is shutting down");
   }
 
-  active_tasks_.fetch_add(1, std::memory_order_release);
-  queue_.Push(operation_driver);
+  if (!operation_driver->is_leader_side()) {
+    while (active_tasks_.load(std::memory_order_acquire) != 0) {
+      YB_LOG_EVERY_N_SECS(WARNING, 1)
+          << "Waiting for active tasks to become zero: "
+          << active_tasks_.load(std::memory_order_acquire);
+      // It should be very rare case, so could do busy wait.
+      std::this_thread::sleep_for(1ms);
+    }
+    operation_driver->PrepareAndStartTask();
+  } else {
+    active_tasks_.fetch_add(1, std::memory_order_release);
+    queue_.Push(operation_driver);
+  }
 
   auto expected = false;
   if (!running_.compare_exchange_strong(expected, true, std::memory_order_acq_rel)) {

@@ -213,29 +225,24 @@ bool ShouldApplySeparately(OperationType operation_type) {
 void PreparerImpl::ProcessItem(OperationDriver* item) {
   CHECK_NOTNULL(item);
 
-  if (item->is_leader_side()) {
-    auto operation_type = item->operation_type();
+  LOG_IF(DFATAL, !item->is_leader_side()) << "Processing follower-side item";
 
-    const bool apply_separately = ShouldApplySeparately(operation_type);
-    const int64_t bound_term = apply_separately ? -1 : item->consensus_round()->bound_term();
+  auto operation_type = item->operation_type();
 
-    // Don't add more than the max number of operations to a batch, and also don't add
-    // operations bound to different terms, so as not to fail unrelated operations
-    // unnecessarily in case of a bound term mismatch.
-    if (leader_side_batch_.size() >= FLAGS_max_group_replicate_batch_size ||
-        (!leader_side_batch_.empty() &&
-         bound_term != leader_side_batch_.back()->consensus_round()->bound_term())) {
-      ProcessAndClearLeaderSideBatch();
-    }
-    leader_side_batch_.push_back(item);
-    if (apply_separately) {
-      ProcessAndClearLeaderSideBatch();
-    }
-  } else {
-    // We found a non-leader-side operation. We need to process the accumulated batch of
-    // leader-side operations first, and then process this other operation.
+  const bool apply_separately = ShouldApplySeparately(operation_type);
+  const int64_t bound_term = apply_separately ? -1 : item->consensus_round()->bound_term();
+
+  // Don't add more than the max number of operations to a batch, and also don't add
+  // operations bound to different terms, so as not to fail unrelated operations
+  // unnecessarily in case of a bound term mismatch.
+  if (leader_side_batch_.size() >= FLAGS_max_group_replicate_batch_size ||
+      (!leader_side_batch_.empty() &&
+       bound_term != leader_side_batch_.back()->consensus_round()->bound_term())) {
+    ProcessAndClearLeaderSideBatch();
+  }
+  leader_side_batch_.push_back(item);
+  if (apply_separately) {
     ProcessAndClearLeaderSideBatch();
-    item->PrepareAndStartTask();
   }
 }
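
For reference, the Submit() change above reduces to a drain-then-run-inline pattern for follower-side operations: wait until no leader-side tasks are in flight, then prepare the operation on the calling thread instead of queueing it. The standalone sketch below uses hypothetical names (ActiveTaskGate, RunWhenDrained); the real code uses PreparerImpl's active_tasks_ counter, PrepareAndStartTask(), and rate-limited logging via YB_LOG_EVERY_N_SECS.

// Standalone sketch of the drain-then-run-inline pattern; hypothetical names.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <functional>
#include <thread>

class ActiveTaskGate {
 public:
  void TaskStarted() { active_.fetch_add(1, std::memory_order_release); }
  void TaskFinished() { active_.fetch_sub(1, std::memory_order_release); }

  // Poll (with a 1ms sleep) until no leader-side tasks are in flight, then run
  // the follower-side work inline on the calling thread.
  void RunWhenDrained(const std::function<void()>& follower_side_work) {
    while (active_.load(std::memory_order_acquire) != 0) {
      // Expected to be rare, so a short polling loop is acceptable.
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    follower_side_work();
  }

 private:
  std::atomic<int64_t> active_{0};
};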
