Skip to content

Commit 2f485f5

Browse files
[yugabyte#8254] No leader lease needed for BackfillIndex
Summary: When backfill index was designed, the master was supposed to spawn out the backfill tasks to the tservers. This was done by selecting the leader tablet for each tablet. However, it is not required that the backfill be done at the leader. Any of the followers that is past the backfilling_time can perform the backfill task; This diff allows followers (i.e. non-leaders) to also perform backfill. While the design is still to perform the backfill at the leader, this change is useful to ensure that backfill is not stuck/gets timed-out in case there is a leadership change. Test Plan: ybd --cxx-test integration-tests_cassandra_cpp_driver-test --gtest_filter CppCassandraDriverTest.TestTableBackfillWithLeaderMoves -n 10 --tp 1 Reviewers: bogdan, jason Reviewed By: jason Subscribers: ybase Differential Revision: https://phabricator.dev.yugabyte.com/D11688
1 parent 5fc9ce1 commit 2f485f5

File tree

4 files changed

+64
-6
lines changed

4 files changed

+64
-6
lines changed

src/yb/integration-tests/cassandra_cpp_driver-test.cc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,49 @@ class CppCassandraDriverTestIndexMultipleChunks : public CppCassandraDriverTestI
266266
}
267267
};
268268

269+
class CppCassandraDriverTestIndexMultipleChunksWithLeaderMoves
270+
: public CppCassandraDriverTestIndexMultipleChunks {
271+
public:
272+
std::vector<std::string> ExtraMasterFlags() override {
273+
auto flags = CppCassandraDriverTestIndex::ExtraMasterFlags();
274+
flags.push_back("--enable_load_balancing=true");
275+
flags.push_back("--index_backfill_rpc_max_retries=0");
276+
// We do not want backfill to fail because of any throttling.
277+
flags.push_back("--index_backfill_rpc_timeout_ms=180000");
278+
return flags;
279+
}
280+
281+
std::vector<std::string> ExtraTServerFlags() override {
282+
auto flags = CppCassandraDriverTestIndex::ExtraTServerFlags();
283+
flags.push_back("--backfill_index_rate_rows_per_sec=10");
284+
flags.push_back("--backfill_index_write_batch_size=2");
285+
return flags;
286+
}
287+
288+
void SetUp() override {
289+
CppCassandraDriverTestIndex::SetUp();
290+
thread_holder_.AddThreadFunctor([this] {
291+
const auto kNumTServers = cluster_->num_tablet_servers();
292+
constexpr auto kSleepTimeMs = 5000;
293+
for (int i = 0; !thread_holder_.stop_flag(); i++) {
294+
const auto tserver_id = i % kNumTServers;
295+
ASSERT_OK(cluster_->AddTServerToLeaderBlacklist(
296+
cluster_->master(), cluster_->tablet_server(tserver_id)));
297+
SleepFor(MonoDelta::FromMilliseconds(kSleepTimeMs));
298+
ASSERT_OK(cluster_->EmptyBlacklist(cluster_->master()));
299+
}
300+
});
301+
}
302+
303+
void TearDown() override {
304+
thread_holder_.Stop();
305+
CppCassandraDriverTestIndex::TearDown();
306+
}
307+
308+
private:
309+
TestThreadHolder thread_holder_;
310+
};
311+
269312
class CppCassandraDriverTestIndexSlowBackfill : public CppCassandraDriverTestIndex {
270313
public:
271314
std::vector<std::string> ExtraMasterFlags() override {
@@ -1862,6 +1905,13 @@ TEST_F_EX(CppCassandraDriverTest, TestTableBackfillInChunks,
18621905
IncludeAllColumns::kTrue, UserEnforced::kFalse);
18631906
}
18641907

1908+
TEST_F_EX(
1909+
CppCassandraDriverTest, TestTableBackfillWithLeaderMoves,
1910+
CppCassandraDriverTestIndexMultipleChunksWithLeaderMoves) {
1911+
TestBackfillIndexTable(
1912+
this, PKOnlyIndex::kFalse, IsUnique::kFalse, IncludeAllColumns::kTrue, UserEnforced::kFalse);
1913+
}
1914+
18651915
TEST_F_EX(CppCassandraDriverTest, TestTableBackfillUniqueInChunks,
18661916
CppCassandraDriverTestIndexMultipleChunks) {
18671917
TestBackfillIndexTable(this, PKOnlyIndex::kFalse, IsUnique::kTrue,

src/yb/master/async_rpc_tasks.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ bool RetryingTSRpcTask::RescheduleWithBackoffDelay() {
321321
}
322322

323323
LOG_WITH_PREFIX(INFO) << "Scheduling retry with a delay of " << delay_millis
324-
<< "ms (attempt = " << attempt_ << ")...";
324+
<< "ms (attempt = " << attempt_ << " / " << attempt_threshold << ")...";
325325

326326
if (!PerformStateTransition(task_state, MonitoredTaskState::kScheduling)) {
327327
LOG_WITH_PREFIX(WARNING) << "Unable to mark this task as MonitoredTaskState::kScheduling";

src/yb/master/backfill_index.cc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1370,9 +1370,14 @@ void BackfillChunk::UnregisterAsyncTaskCallback() {
13701370

13711371
if (resp_.has_error()) {
13721372
status = StatusFromPB(resp_.error().status());
1373-
for (int i = 0; i < resp_.failed_index_ids_size(); i++) {
1374-
VLOG(1) << " Added to failed index " << resp_.failed_index_ids(i);
1375-
failed_indexes.insert(resp_.failed_index_ids(i));
1373+
if (resp_.failed_index_ids_size() > 0) {
1374+
for (int i = 0; i < resp_.failed_index_ids_size(); i++) {
1375+
VLOG(1) << " Added to failed index " << resp_.failed_index_ids(i);
1376+
failed_indexes.insert(resp_.failed_index_ids(i));
1377+
}
1378+
} else {
1379+
// No specific index was marked as a failure. So consider all of them as failed.
1380+
failed_indexes = indexes_being_backfilled_;
13761381
}
13771382
} else if (state() != MonitoredTaskState::kComplete) {
13781383
// There is no response, so the error happened even before we could

src/yb/tserver/tablet_service.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -684,9 +684,12 @@ void TabletServiceAdminImpl::BackfillIndex(
684684

685685
// Wait for SafeTime to get past read_at;
686686
const HybridTime read_at(req->read_at_hybrid_time());
687-
const auto safe_time = tablet.peer->tablet()->SafeTime(
688-
tablet::RequireLease::kTrue, read_at, deadline);
687+
DVLOG(1) << "Waiting for safe time to be past " << read_at;
688+
const auto safe_time =
689+
tablet.peer->tablet()->SafeTime(tablet::RequireLease::kFalse, read_at, deadline);
690+
DVLOG(1) << "Got safe time " << safe_time.ToString();
689691
if (!safe_time.ok()) {
692+
LOG(ERROR) << "Could not get a good enough safe time " << safe_time.ToString();
690693
SetupErrorAndRespond(
691694
resp->mutable_error(),
692695
safe_time.status(),

0 commit comments

Comments
 (0)