iSignal
diff --git a/‎src/yb/integration-tests/compaction-test.cc‎
Lines changed: 169 additions & 11 deletions b/‎src/yb/integration-tests/compaction-test.cc‎
Lines changed: 169 additions & 11 deletions
diff --git a/‎src/yb/integration-tests/mini_cluster.cc‎
Lines changed: 73 additions & 0 deletions b/‎src/yb/integration-tests/mini_cluster.cc‎
Lines changed: 73 additions & 0 deletions
diff --git a/‎src/yb/integration-tests/mini_cluster.h‎
Lines changed: 12 additions & 0 deletions b/‎src/yb/integration-tests/mini_cluster.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎src/yb/rocksdb/db/db_impl.cc‎
Lines changed: 7 additions & 2 deletions b/‎src/yb/rocksdb/db/db_impl.cc‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎src/yb/util/shmem/robust_hash_map.h‎
Lines changed: 1 addition & 1 deletion b/‎src/yb/util/shmem/robust_hash_map.h‎
Lines changed: 1 addition & 1 deletion
@@ -94,15 +94,20 @@ DECLARE_bool(TEST_disable_adding_last_compaction_to_tablet_metadata);
 DECLARE_bool(TEST_disable_adding_user_frontier_to_sst);
 DECLARE_bool(TEST_disable_getting_user_frontier_from_mem_table);
 DECLARE_bool(TEST_pause_before_full_compaction);
+DECLARE_bool(enable_ondisk_compression);
 DECLARE_bool(enable_load_balancing);
 DECLARE_bool(file_expiration_ignore_value_ttl);
 DECLARE_bool(file_expiration_value_ttl_overrides_table_ttl);
+DECLARE_bool(rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool);
+DECLARE_bool(rocksdb_determine_compaction_input_at_start);
 DECLARE_bool(tablet_enable_ttl_file_filter);
 DECLARE_bool(use_priority_thread_pool_for_compactions);
+DECLARE_bool(ycql_enable_packed_row);
 
 DECLARE_double(auto_compact_percent_obsolete);
 
 DECLARE_int32(auto_compact_check_interval_sec);
+DECLARE_int32(cleanup_split_tablets_interval_sec);
 DECLARE_int32(full_compaction_pool_max_queue_size);
 DECLARE_int32(full_compaction_pool_max_threads);
 DECLARE_int32(priority_thread_pool_size);
@@ -122,13 +127,12 @@ DECLARE_int64(rocksdb_compact_flush_rate_limit_bytes_per_sec);
 DECLARE_uint32(auto_compact_min_obsolete_keys_found);
 DECLARE_uint32(auto_compact_stat_window_seconds);
 
+DECLARE_uint64(post_split_compaction_input_size_threshold_bytes);
 DECLARE_uint64(rocksdb_max_file_size_for_compaction);
 
 DECLARE_string(allow_compaction_failures_for_tablet_ids);
 
-namespace yb {
-
-namespace tserver {
+namespace yb::tserver {
 
 namespace {
 
@@ -206,6 +210,8 @@ class CompactionTest : public YBTest {
 
     ANNOTATE_UNPROTECTED_WRITE(FLAGS_priority_thread_pool_size) = 2;
 
+    ANNOTATE_UNPROTECTED_WRITE(FLAGS_cleanup_split_tablets_interval_sec) = 1;
+
     // Disable scheduled compactions by default so we don't have surprise compactions.
     ANNOTATE_UNPROTECTED_WRITE(FLAGS_scheduled_full_compaction_frequency_hours) = 0;
     ANNOTATE_UNPROTECTED_WRITE(FLAGS_scheduled_full_compaction_jitter_factor_percentage) = 0;
@@ -219,12 +225,8 @@ class CompactionTest : public YBTest {
     // These flags should be set after minicluster start, so it wouldn't override them.
     ANNOTATE_UNPROTECTED_WRITE(FLAGS_db_write_buffer_size) = kMemStoreSize;
     ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_level0_file_num_compaction_trigger) = 3;
-    // Patch tablet options inside tablet manager, will be applied to newly created tablets.
-    for (int i = 0 ; i < NumTabletServers(); i++) {
-      ANNOTATE_IGNORE_WRITES_BEGIN();
-      cluster_->GetTabletManager(i)->TEST_tablet_options()->listeners.push_back(rocksdb_listener_);
-      ANNOTATE_IGNORE_WRITES_END();
-    }
+
+    AddRocksDBListener(rocksdb_listener_);
 
     client_ = ASSERT_RESULT(cluster_->CreateClient());
     transaction_manager_ = std::make_unique<client::TransactionManager>(
@@ -242,6 +244,15 @@ class CompactionTest : public YBTest {
     YBTest::TearDown();
   }
 
+  void AddRocksDBListener(std::shared_ptr<rocksdb::EventListener> listener) {
+    // Register the listener in every tablet manager's options; applied to newly created tablets.
+    for (int i = 0 ; i < NumTabletServers(); i++) {
+      ANNOTATE_IGNORE_WRITES_BEGIN();
+      cluster_->GetTabletManager(i)->TEST_tablet_options()->listeners.push_back(listener);
+      ANNOTATE_IGNORE_WRITES_END();
+    }
+  }
+
   void SetupWorkload(IsolationLevel isolation_level, int num_tablets = kDefaultNumTablets) {
     workload_.reset(new TestWorkload(cluster_.get()));
     workload_->set_timeout_allowed(true);
@@ -1846,6 +1857,154 @@ TEST_F(CompactionTest, CheckLastRequestTimePersistence) {
   ASSERT_GT(table_info->LockForRead()->pb.last_full_compaction_request_time(), last_request_time);
 }
 
+// Covers https://github.com/yugabyte/yugabyte-db/issues/27426. Refer to D44394 for the description.
+TEST_F(CompactionTest, BackgroundCompactionDuringPostSplitCompaction) {
+  constexpr size_t kNumTablets = 1;
+  constexpr size_t kNumFiles = 9;
+  constexpr size_t kTrigger = kNumFiles - 2;
+  constexpr uint64_t kSstFileSize = 500_KB;
+  constexpr uint64_t kThreshold = kSstFileSize * 0.80;
+
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_ycql_enable_packed_row) = true;
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_ondisk_compression) = false;
+
+  // Configure flags to guarantee that a background compaction kicks in between post split
+  // compaction iterations.
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_db_write_buffer_size) = kSstFileSize;
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_max_file_size_for_compaction) = kThreshold;
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_level0_file_num_compaction_trigger) = kTrigger;
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_post_split_compaction_input_size_threshold_bytes) = kThreshold;
+
+  // Configure flags to guarantee that the background compaction picks SST files at the end of
+  // post split compaction and keeps them locked until the compaction is finished.
+  ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_determine_compaction_input_at_start) = false;
+  ANNOTATE_UNPROTECTED_WRITE(
+      FLAGS_rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool) = true;
+
+  // Sanity checks for minimal requirements.
+  ASSERT_GT(ANNOTATE_UNPROTECTED_READ(FLAGS_full_compaction_pool_max_threads), 0);
+  ASSERT_GT(ANNOTATE_UNPROTECTED_READ(FLAGS_priority_thread_pool_size), 1);
+
+  // Helpers to extract files information.
+  auto files_ids = [](auto&& files) {
+    return AsString(files, [](auto&& file) { return file.name_id; });
+  };
+  auto max_file_id = [](auto&& files) {
+    return std::ranges::max_element(files, {}, &rocksdb::LiveFileMetaData::name_id)->name_id;
+  };
+  auto min_file_id = [](auto&& files) {
+    return std::ranges::min_element(files, {}, &rocksdb::LiveFileMetaData::name_id)->name_id;
+  };
+
+  // Additional RocksDB listener to guarantee the compaction flow.
+  struct DBListener : public rocksdb::EventListener {
+    bool background_compaction_in_progress = false;
+    size_t num_post_split_iterations = 0;  // Reset when a no-op compaction completes.
+    std::mutex mutex;  // Guards both fields above.
+    std::condition_variable_any compaction_started_cv;
+
+    void OnCompactionStarted() override {
+      UniqueLock lock(mutex);
+
+      // The compaction that starts after kTrigger iterations is treated as the background one.
+      if (num_post_split_iterations == kTrigger && !background_compaction_in_progress) {
+        LOG(INFO) << "Background compaction started";
+        background_compaction_in_progress = true;
+
+        // Wait until the next post split compaction iteration is triggered, or exit on timeout.
+        compaction_started_cv.wait_for(
+            lock, std::chrono::seconds(30),
+            [this] { return num_post_split_iterations != kTrigger; });
+      } else {
+        ++num_post_split_iterations;
+        compaction_started_cv.notify_all();
+      }
+    }
+
+    void OnCompactionCompleted(rocksdb::DB* db, const rocksdb::CompactionJobInfo& info) override {
+      LOG(INFO) << "Compaction completed, reason: " << info.compaction_reason;
+
+      std::lock_guard lock(mutex);
+      if (info.is_no_op_compaction) {
+        // Sanity check, the only no-op compaction is the post split compaction final iteration.
+        ASSERT_EQ(info.compaction_reason, rocksdb::CompactionReason::kPostSplitCompaction);
+
+        LOG(INFO) << "Number of post split compaction iterations: " << num_post_split_iterations;
+        num_post_split_iterations = 0; // Resetting to track compactions for the next child.
+
+        // This no-op post split compaction iteration happens in any case, so unblock the
+        // background compaction to let it complete.
+        compaction_started_cv.notify_all();
+      } else if (info.compaction_reason != rocksdb::CompactionReason::kPostSplitCompaction) {
+        background_compaction_in_progress = false;
+        EXPECT_EQ(info.compaction_reason, rocksdb::CompactionReason::kUniversalSizeAmplification);
+        LOG(INFO) << "Background compaction done";
+      }
+    }
+  };
+  auto listener = std::make_shared<DBListener>();
+  AddRocksDBListener(listener);
+
+  SetupWorkload(IsolationLevel::NON_TRANSACTIONAL, kNumTablets);
+
+  // Change the table to have a default time to live. This makes the issue easiest to reproduce,
+  // but the issue may happen even without a default TTL.
+  ASSERT_OK(ChangeTableTTL(workload_->table_name(), /* ttl_sec = */ 1000));
+  ASSERT_OK(WriteAtLeastFilesPerDb(kNumFiles));
+
+  // Flush mem tables to have a predictable number of SST files.
+  const auto table_info = ASSERT_RESULT(FindTable(cluster_.get(), workload_->table_name()));
+  ASSERT_OK(workload_->client().FlushTables(
+      {table_info->id()}, /* add_indexes = */ false,
+      /* timeout_secs = */ 60, /* is_compaction = */ false));
+
+  // Remember parent files before split.
+  auto dbs = GetAllRocksDbs(cluster_.get(), /* include_intents = */ false);
+  ASSERT_EQ(dbs.size(), 1);
+
+  uint64_t parent_max_file_id = 0;
+  {
+    const auto files = dbs.front()->GetLiveFilesMetaData();
+    parent_max_file_id = max_file_id(files);
+    LOG(INFO) << "Parent files: " << files_ids(files);
+  }
+
+  // Trigger manual tablet split.
+  auto peers = ListTabletPeers(cluster_.get(), ListPeersFilter::kAll);
+  ASSERT_EQ(peers.size(), kNumTablets);
+  const auto tablet = ASSERT_RESULT(peers.front()->shared_tablet_safe());
+  ASSERT_OK(InvokeSplitTabletRpcAndWaitForDataCompacted(cluster_.get(), tablet->tablet_id()));
+
+  // Wait until the parent tablet is cleaned up.
+  ASSERT_OK(LoggedWaitFor(
+      [cluster = cluster_.get()]{
+        return ListTabletPeers(cluster, ListPeersFilter::kAll).size() == 2;
+      }, 60s, "Parent tablet cleanup"));
+
+  // The total number of compactions equals the sum of the number of post split compaction
+  // iterations and one background compaction. The number of post split compaction iterations
+  // equals the number of parent files plus one empty iteration indicating its completion.
+  constexpr size_t kNumParentFiles = kNumFiles + 1; // One more file due to an explicit flush.
+  constexpr size_t kNumPostSplitCompactionIterations = kNumParentFiles + 1;
+  constexpr size_t kNumBackgroundCompactions = 1;
+  constexpr size_t kNumExpectedCompactions =
+      kNumPostSplitCompactionIterations + kNumBackgroundCompactions;
+
+  // Postpone the status check so that the children files are logged first.
+  auto status = WaitForNumCompactionsPerDb(kNumExpectedCompactions);
+
+  // Make sure child tablets do not have parent files.
+  dbs = GetAllRocksDbs(cluster_.get(), /* include_intents = */ false);
+  ASSERT_EQ(dbs.size(), 2);
+  for (auto* db : dbs) {
+    const auto files = db->GetLiveFilesMetaData();
+    LOG(INFO) << "Child files: " << files_ids(files);
+    ASSERT_LT(parent_max_file_id, min_file_id(files));
+  }
+
+  ASSERT_OK(status);
+}
+
 class FullCompactionMonitoringTest : public CompactionTest {
  protected:
   void SetUp() override {
@@ -2183,5 +2342,4 @@ TEST_F(CompactionTest, RemoveCorruptDataBlocks) {
   ASSERT_LE(num_keys_lost, num_max_corrupt_keys_estimate);
 }
 
-} // namespace tserver
-} // namespace yb
+} // namespace yb::tserver
@@ -47,11 +47,14 @@
 #include "yb/gutil/strings/join.h"
 #include "yb/gutil/strings/substitute.h"
 
+#include "yb/integration-tests/cluster_itest_util.h"
+
 #include "yb/master/catalog_entity_info.h"
 #include "yb/master/catalog_manager_if.h"
 #include "yb/master/catalog_manager.h"
 #include "yb/master/master.h"
 #include "yb/master/master_admin.pb.h"
+#include "yb/master/master_admin.proxy.h"
 #include "yb/master/master_client.pb.h"
 #include "yb/master/master_cluster.pb.h"
 #include "yb/master/master_ddl.pb.h"
@@ -188,6 +191,12 @@ bool IsForTable(const tablet::TabletPeer& peer, const TableId& table_id) {
   return false;
 }
 
+bool IsTabletInCollection(const master::TabletInfoPtr& tablet, const master::TabletInfos& tablets) {
+  return tablets.end() != std::find_if(  // True when an entry has the same tablet id.
+      tablets.begin(), tablets.end(),
+      [&tablet](const master::TabletInfoPtr& p) { return p->tablet_id() == tablet->tablet_id(); });
+}
+
 } // namespace
 
 MiniCluster::MiniCluster(const MiniClusterOptions& options)
@@ -1547,6 +1556,70 @@ void SetCompactFlushRateLimitBytesPerSec(MiniCluster* cluster, const size_t byte
   }
 }
 
+Status InvokeSplitTabletRpc(MiniCluster* cluster, const TabletId& tablet_id, MonoDelta timeout) {
+  auto& master = *VERIFY_RESULT(cluster->GetLeaderMiniMaster());  // Request goes to the leader.
+  auto  proxy = master::MasterAdminProxy(&cluster->proxy_cache(), master.bound_rpc_addr());
+
+  master::SplitTabletRequestPB req;
+  req.set_tablet_id(tablet_id);
+
+  rpc::RpcController controller;
+  controller.set_timeout(timeout);
+  master::SplitTabletResponsePB resp;
+  RETURN_NOT_OK(proxy.SplitTablet(req, &resp, &controller));
+  if (resp.has_error()) {  // Convert an error reported in the response into a Status.
+    RETURN_NOT_OK(StatusFromPB(resp.error().status()));
+  }
+  return Status::OK();
+}
+
+Status InvokeSplitTabletRpcAndWaitForDataCompacted(
+    MiniCluster* cluster, const master::TableInfoPtr& table,
+    const master::TabletInfoPtr& tablet, MonoDelta rpc_timeout) {
+  // Remember the tablets the table has before the split.
+  const auto tablets = VERIFY_RESULT(table->GetTablets());
+
+  // Sanity check that the tablet belongs to the table.
+  if (!IsTabletInCollection(tablet, tablets)) {
+    return STATUS(InvalidArgument, "The tablet does not belong to table's tablets list.");
+  }
+
+  // Send split RPC.
+  RETURN_NOT_OK(InvokeSplitTabletRpc(cluster, tablet->tablet_id(), rpc_timeout));
+
+  // Wait until the new tablets are added.
+  RETURN_NOT_OK(WaitForTableActiveTabletLeadersPeers(cluster, table->id(), tablets.size() + 1));
+
+  // Wait until split is replicated across all tablet servers.
+  RETURN_NOT_OK(WaitAllReplicasReady(
+      cluster, table->id(), MonoDelta::FromSeconds(20) * kTimeMultiplier));
+
+  // Select the ids of the tablets that were not present before the split.
+  const auto all_tablets = VERIFY_RESULT(table->GetTablets());
+  std::vector<TabletId> new_tablet_ids;
+  new_tablet_ids.reserve(all_tablets.size());
+  for (const auto& t : all_tablets) {
+    if (!IsTabletInCollection(t, tablets)) {
+      new_tablet_ids.push_back(t->tablet_id());
+    }
+  }
+
+  // Wait until the new peers are fully compacted.
+  return WaitForPeersPostSplitCompacted(cluster, new_tablet_ids);
+}
+
+Status InvokeSplitTabletRpcAndWaitForDataCompacted(  // Overload taking only the tablet id.
+    MiniCluster* cluster, const TabletId& tablet_id, MonoDelta rpc_timeout) {
+  auto* master = VERIFY_RESULT(cluster->GetLeaderMiniMaster());
+  auto& catalog_manager = master->catalog_manager();
+  const auto tablet = VERIFY_RESULT(catalog_manager.GetTabletInfo(tablet_id));
+
+  // Look up the owning table. NOTE(review): the result is used without a null check.
+  const auto table = catalog_manager.GetTableInfo(tablet->table()->id());
+
+  return InvokeSplitTabletRpcAndWaitForDataCompacted(cluster, table, tablet, rpc_timeout);
+}
+
 Status WaitAllReplicasSynchronizedWithLeader(
     MiniCluster* cluster, CoarseTimePoint deadline) {
   auto leaders = ListTabletPeers(cluster, ListPeersFilter::kLeaders);
 
@@ -480,6 +480,18 @@ Result<size_t> ServerWithLeaders(MiniCluster* cluster);
 // for already created tablets.
 void SetCompactFlushRateLimitBytesPerSec(MiniCluster* cluster, size_t bytes_per_sec);
 
+Status InvokeSplitTabletRpc(  // Sends a SplitTablet request to the leader master.
+    MiniCluster* cluster, const TabletId& tablet_id,
+    MonoDelta timeout = MonoDelta::FromSeconds(60) * kTimeMultiplier);
+
+Status InvokeSplitTabletRpcAndWaitForDataCompacted(  // Splits, then waits for new peers.
+    MiniCluster* cluster, const master::TableInfoPtr& table, const master::TabletInfoPtr& tablet,
+    MonoDelta rpc_timeout = MonoDelta::FromSeconds(60) * kTimeMultiplier);
+
+Status InvokeSplitTabletRpcAndWaitForDataCompacted(  // Same, resolving the tablet by id.
+    MiniCluster* cluster, const TabletId& tablet_id,
+    MonoDelta rpc_timeout = MonoDelta::FromSeconds(60) * kTimeMultiplier);
+
 Status WaitAllReplicasSynchronizedWithLeader(MiniCluster* cluster, CoarseTimePoint deadline);
 
 Status WaitAllReplicasSynchronizedWithLeader(MiniCluster* cluster, CoarseDuration timeout);
 
@@ -3910,8 +3910,13 @@ Result<FileNumbersHolder> DBImpl::BackgroundCompaction(
   }
 
   Result<FileNumbersHolder> result = FileNumbersHolder();
-  for (auto listener : db_options_.listeners) {
-    listener->OnCompactionStarted();
+
+  {
+    mutex_.Unlock();
+    for (auto listener : db_options_.listeners) {
+      listener->OnCompactionStarted();
+    }
+    mutex_.Lock();
   }
 
   if (!c) {
 
@@ -414,7 +414,7 @@ class RobustHashMap {
     SHARED_MEMORY_STORE(in_progress_node_, nullptr);
   }
 
-  void DoDelete(List::const_iterator prev_itr) {
+  void DoDelete(typename List::const_iterator prev_itr) {
     auto* prev = &*prev_itr.unconst();
     auto* node = &*std::next(prev_itr).unconst();
     SHARED_MEMORY_STORE(in_progress_num_elements_, SHARED_MEMORY_LOAD(num_elements_) - 1);
Original file line number	Diff line number	Diff line change
`@@ -3910,8 +3910,13 @@ Result<FileNumbersHolder> DBImpl::BackgroundCompaction(`
`3910`	`3910`	`}`
`3911`	`3911`
`3912`	`3912`	`Result<FileNumbersHolder> result = FileNumbersHolder();`
`3913`		`- for (auto listener : db_options_.listeners) {`
`3914`		`- listener->OnCompactionStarted();`
	`3913`	`+`
	`3914`	`+ {`
	`3915`	`+ mutex_.Unlock();`
	`3916`	`+ for (auto listener : db_options_.listeners) {`
	`3917`	`+ listener->OnCompactionStarted();`
	`3918`	`+ }`
	`3919`	`+ mutex_.Lock();`
`3915`	`3920`	`}`
`3916`	`3921`
`3917`	`3922`	`if (!c) {`
Original file line number	Diff line number	Diff line change
`@@ -414,7 +414,7 @@ class RobustHashMap {`
`414`	`414`	`SHARED_MEMORY_STORE(in_progress_node_, nullptr);`
`415`	`415`	`}`
`416`	`416`
`417`		`- void DoDelete(List::const_iterator prev_itr) {`
	`417`	`+ void DoDelete(typename List::const_iterator prev_itr) {`
`418`	`418`	`auto* prev = &*prev_itr.unconst();`
`419`	`419`	`auto* node = &*std::next(prev_itr).unconst();`
`420`	`420`	`SHARED_MEMORY_STORE(in_progress_num_elements_, SHARED_MEMORY_LOAD(num_elements_) - 1);`