@@ -94,15 +94,20 @@ DECLARE_bool(TEST_disable_adding_last_compaction_to_tablet_metadata);
9494DECLARE_bool (TEST_disable_adding_user_frontier_to_sst);
9595DECLARE_bool (TEST_disable_getting_user_frontier_from_mem_table);
9696DECLARE_bool (TEST_pause_before_full_compaction);
97+ DECLARE_bool (enable_ondisk_compression);
9798DECLARE_bool (enable_load_balancing);
9899DECLARE_bool (file_expiration_ignore_value_ttl);
99100DECLARE_bool (file_expiration_value_ttl_overrides_table_ttl);
101+ DECLARE_bool (rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool);
102+ DECLARE_bool (rocksdb_determine_compaction_input_at_start);
100103DECLARE_bool (tablet_enable_ttl_file_filter);
101104DECLARE_bool (use_priority_thread_pool_for_compactions);
105+ DECLARE_bool (ycql_enable_packed_row);
102106
103107DECLARE_double (auto_compact_percent_obsolete);
104108
105109DECLARE_int32 (auto_compact_check_interval_sec);
110+ DECLARE_int32 (cleanup_split_tablets_interval_sec);
106111DECLARE_int32 (full_compaction_pool_max_queue_size);
107112DECLARE_int32 (full_compaction_pool_max_threads);
108113DECLARE_int32 (priority_thread_pool_size);
@@ -122,13 +127,12 @@ DECLARE_int64(rocksdb_compact_flush_rate_limit_bytes_per_sec);
122127DECLARE_uint32 (auto_compact_min_obsolete_keys_found);
123128DECLARE_uint32 (auto_compact_stat_window_seconds);
124129
130+ DECLARE_uint64 (post_split_compaction_input_size_threshold_bytes);
125131DECLARE_uint64 (rocksdb_max_file_size_for_compaction);
126132
127133DECLARE_string (allow_compaction_failures_for_tablet_ids);
128134
129- namespace yb {
130-
131- namespace tserver {
135+ namespace yb ::tserver {
132136
133137namespace {
134138
@@ -206,6 +210,8 @@ class CompactionTest : public YBTest {
206210
207211 ANNOTATE_UNPROTECTED_WRITE (FLAGS_priority_thread_pool_size) = 2 ;
208212
213+ ANNOTATE_UNPROTECTED_WRITE (FLAGS_cleanup_split_tablets_interval_sec) = 1 ;
214+
209215 // Disable scheduled compactions by default so we don't have surprise compactions.
210216 ANNOTATE_UNPROTECTED_WRITE (FLAGS_scheduled_full_compaction_frequency_hours) = 0 ;
211217 ANNOTATE_UNPROTECTED_WRITE (FLAGS_scheduled_full_compaction_jitter_factor_percentage) = 0 ;
@@ -219,12 +225,8 @@ class CompactionTest : public YBTest {
219225 // These flags should be set after minicluster start, so it wouldn't override them.
220226 ANNOTATE_UNPROTECTED_WRITE (FLAGS_db_write_buffer_size) = kMemStoreSize ;
221227 ANNOTATE_UNPROTECTED_WRITE (FLAGS_rocksdb_level0_file_num_compaction_trigger) = 3 ;
222- // Patch tablet options inside tablet manager, will be applied to newly created tablets.
223- for (int i = 0 ; i < NumTabletServers (); i++) {
224- ANNOTATE_IGNORE_WRITES_BEGIN ();
225- cluster_->GetTabletManager (i)->TEST_tablet_options ()->listeners .push_back (rocksdb_listener_);
226- ANNOTATE_IGNORE_WRITES_END ();
227- }
228+
229+ AddRocksDBListener (rocksdb_listener_);
228230
229231 client_ = ASSERT_RESULT (cluster_->CreateClient ());
230232 transaction_manager_ = std::make_unique<client::TransactionManager>(
@@ -242,6 +244,15 @@ class CompactionTest : public YBTest {
242244 YBTest::TearDown ();
243245 }
244246
247+ void AddRocksDBListener (std::shared_ptr<rocksdb::EventListener> listener) {
248+ // Patch tablet options inside tablet manager, will be applied to newly created tablets.
249+ for (int i = 0 ; i < NumTabletServers (); i++) {
250+ ANNOTATE_IGNORE_WRITES_BEGIN ();
251+ cluster_->GetTabletManager (i)->TEST_tablet_options ()->listeners .push_back (listener);
252+ ANNOTATE_IGNORE_WRITES_END ();
253+ }
254+ }
255+
245256 void SetupWorkload (IsolationLevel isolation_level, int num_tablets = kDefaultNumTablets ) {
246257 workload_.reset (new TestWorkload (cluster_.get ()));
247258 workload_->set_timeout_allowed (true );
@@ -1846,6 +1857,154 @@ TEST_F(CompactionTest, CheckLastRequestTimePersistence) {
18461857 ASSERT_GT (table_info->LockForRead ()->pb .last_full_compaction_request_time (), last_request_time);
18471858}
18481859
// Covers https://github.com/yugabyte/yugabyte-db/issues/27426. Refer to D44394 for the description.
//
// Regression test: a background compaction that starts in between post split compaction
// iterations and holds its input SST files must not prevent the child tablets from getting
// rid of the parent tablet's files.
TEST_F(CompactionTest, BackgroundCompactionDuringPostSplitCompaction) {
  constexpr size_t kNumTablets = 1;
  constexpr size_t kNumFiles = 9;
  // Doubles as the level0 compaction trigger (set below) and as the number of post split
  // compaction iterations after which the background compaction is expected to start.
  constexpr size_t kTrigger = kNumFiles - 2;
  constexpr uint64_t kSstFileSize = 500_KB;
  // 80% of an SST file size — used both as the max file size for regular compactions and as the
  // post split compaction input size threshold.
  constexpr uint64_t kThreshold = kSstFileSize * 0.80;

  ANNOTATE_UNPROTECTED_WRITE(FLAGS_ycql_enable_packed_row) = true;
  ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_ondisk_compression) = false;

  // Configuring flags to guarantee a background compaction will kick in between post split
  // compaction iterations.
  ANNOTATE_UNPROTECTED_WRITE(FLAGS_db_write_buffer_size) = kSstFileSize;
  ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_max_file_size_for_compaction) = kThreshold;
  ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_level0_file_num_compaction_trigger) = kTrigger;
  ANNOTATE_UNPROTECTED_WRITE(FLAGS_post_split_compaction_input_size_threshold_bytes) = kThreshold;

  // Configuring flags to guarantee background compaction picks SST files at the end of post split
  // compaction and keeps them locked till the compaction is finished.
  ANNOTATE_UNPROTECTED_WRITE(FLAGS_rocksdb_determine_compaction_input_at_start) = false;
  ANNOTATE_UNPROTECTED_WRITE(
      FLAGS_rocksdb_allow_multiple_pending_compactions_for_priority_thread_pool) = true;

  // Sanity checks for minimal requirements: at least one full-compaction thread and more than one
  // priority-pool thread, so post split and background compactions can actually overlap.
  ASSERT_GT(ANNOTATE_UNPROTECTED_READ(FLAGS_full_compaction_pool_max_threads), 0);
  ASSERT_GT(ANNOTATE_UNPROTECTED_READ(FLAGS_priority_thread_pool_size), 1);

  // Helpers to extract files information.
  auto files_ids = [](auto&& files) {
    return AsString(files, [](auto&& file) { return file.name_id; });
  };
  auto max_file_id = [](auto&& files) {
    return std::ranges::max_element(files, {}, &rocksdb::LiveFileMetaData::name_id)->name_id;
  };
  auto min_file_id = [](auto&& files) {
    return std::ranges::min_element(files, {}, &rocksdb::LiveFileMetaData::name_id)->name_id;
  };

  // Additional RocksDB listener to guarantee compaction flow: it parks the background compaction
  // inside OnCompactionStarted until the next post split compaction iteration begins, reproducing
  // the overlap from the linked issue.
  struct DBListener : public rocksdb::EventListener {
    bool background_compaction_in_progress = false;  // Guarded by mutex.
    size_t num_post_split_iterations = 0;            // Guarded by mutex.
    std::mutex mutex;
    std::condition_variable_any compaction_started_cv;

    void OnCompactionStarted() override {
      UniqueLock lock(mutex);

      // The first compaction started after kTrigger post split iterations (while no background
      // compaction is tracked yet) is treated as the background compaction — presumably the one
      // fired by the level0 trigger configured above; hold it here so it overlaps the next post
      // split iteration. (NOTE(review): the original comment was truncated — confirm intent.)
      if (num_post_split_iterations == kTrigger && !background_compaction_in_progress) {
        LOG(INFO) << "Background compaction started";
        background_compaction_in_progress = true;

        // Wait for the next post split compaction iteration got triggered or exit on timeout.
        compaction_started_cv.wait_for(
            lock, std::chrono::seconds(30),
            [this] { return num_post_split_iterations != kTrigger; });
      } else {
        ++num_post_split_iterations;
        compaction_started_cv.notify_all();
      }
    }

    void OnCompactionCompleted(rocksdb::DB* db, const rocksdb::CompactionJobInfo& info) override {
      LOG(INFO) << "Compaction completed, reason: " << info.compaction_reason;

      std::lock_guard lock(mutex);
      if (info.is_no_op_compaction) {
        // Sanity check, the only no-op compaction is the post split compaction final iteration.
        ASSERT_EQ(info.compaction_reason, rocksdb::CompactionReason::kPostSplitCompaction);

        LOG(INFO) << "Number of post split compaction iterations: " << num_post_split_iterations;
        num_post_split_iterations = 0;  // Resetting to track compactions for the next child.

        // This no op post split compaction iteration happens in any case, let's unblock
        // background compaction to complete it.
        compaction_started_cv.notify_all();
      } else if (info.compaction_reason != rocksdb::CompactionReason::kPostSplitCompaction) {
        background_compaction_in_progress = false;
        EXPECT_EQ(info.compaction_reason, rocksdb::CompactionReason::kUniversalSizeAmplification);
        LOG(INFO) << "Background compaction done";
      }
    }
  };
  auto listener = std::make_shared<DBListener>();
  AddRocksDBListener(listener);

  SetupWorkload(IsolationLevel::NON_TRANSACTIONAL, kNumTablets);

  // Change the table to have a default time to live. This is required for the easiest reproing,
  // but the issue may happen even without default TTL.
  ASSERT_OK(ChangeTableTTL(workload_->table_name(), /* ttl_sec = */ 1000));
  ASSERT_OK(WriteAtLeastFilesPerDb(kNumFiles));

  // Flush mem tables to have the predictable number of SST files.
  const auto table_info = ASSERT_RESULT(FindTable(cluster_.get(), workload_->table_name()));
  ASSERT_OK(workload_->client().FlushTables(
      {table_info->id()}, /* add_indexes = */ false,
      /* timeout_secs = */ 60, /* is_compaction = */ false));

  // Remember parent files before split (the max file id lets us assert later that children keep
  // only newer files).
  auto dbs = GetAllRocksDbs(cluster_.get(), /* include_intents = */ false);
  ASSERT_EQ(dbs.size(), 1);

  uint64_t parent_max_file_id = 0;
  {
    const auto files = dbs.front()->GetLiveFilesMetaData();
    parent_max_file_id = max_file_id(files);
    LOG(INFO) << "Parent files: " << files_ids(files);
  }

  // Trigger manual tablet split.
  auto peers = ListTabletPeers(cluster_.get(), ListPeersFilter::kAll);
  ASSERT_EQ(peers.size(), kNumTablets);
  const auto tablet = ASSERT_RESULT(peers.front()->shared_tablet_safe());
  ASSERT_OK(InvokeSplitTabletRpcAndWaitForDataCompacted(cluster_.get(), tablet->tablet_id()));

  // Wait until parent tablet got cleaned up (only the two children remain).
  ASSERT_OK(LoggedWaitFor(
      [cluster = cluster_.get()]{
        return ListTabletPeers(cluster, ListPeersFilter::kAll).size() == 2;
      }, 60s, "Parent tablet cleanup"));

  // Total number of compactions equals to a sum of number of post split compaction iterations and
  // one background compaction. Number of post split compaction iterations equals to the number of
  // parent files plus one empty iteration to indicate post split compaction completion.
  constexpr size_t kNumParentFiles = kNumFiles + 1;  // One more file due to an explicit flush.
  constexpr size_t kNumPostSplitCompactionIterations = kNumParentFiles + 1;
  constexpr size_t kNumBackgroundCompactions = 1;
  constexpr size_t kNumExpectedCompactions =
      kNumPostSplitCompactionIterations + kNumBackgroundCompactions;

  // Postpone status check for logging children files.
  auto status = WaitForNumCompactionsPerDb(kNumExpectedCompactions);

  // Make sure child tablets do not have parent files: every live file in each child must have an
  // id strictly greater than any parent file id.
  dbs = GetAllRocksDbs(cluster_.get(), /* include_intents = */ false);
  ASSERT_EQ(dbs.size(), 2);
  for (auto* db : dbs) {
    const auto files = db->GetLiveFilesMetaData();
    LOG(INFO) << "Child files: " << files_ids(files);
    ASSERT_LT(parent_max_file_id, min_file_id(files));
  }

  ASSERT_OK(status);
}
2007+
18492008class FullCompactionMonitoringTest : public CompactionTest {
18502009 protected:
18512010 void SetUp () override {
@@ -2183,5 +2342,4 @@ TEST_F(CompactionTest, RemoveCorruptDataBlocks) {
21832342 ASSERT_LE (num_keys_lost, num_max_corrupt_keys_estimate);
21842343}
21852344
2186- } // namespace tserver
2187- } // namespace yb
2345+ } // namespace yb::tserver
0 commit comments