Skip to content

Commit 7d60edf

Browse files
committed
[yugabyte#8204] ybase: GetLoadMoveCompletionPercent returns an incorrect 100% if tservers haven't
heartbeated their tablet reports Summary: Currently, if tablet servers don't check-in their tablets after a master leader failover, the GetLoadMoveCompletionPercent incorrectly returns a 100 which can be catastrophic. If we don't find any load on the blacklisted tservers then it can be because the tservers are yet to heartbeat their tablet reports to this master. We shouldn't give a false presumption that the load transfer is complete. We expect that by load_balancer_initial_delay_secs time, this should go away and if the load is reported as 0 on the blacklisted tservers after this time then it means that the transfer is successfully complete. Test Plan: ybd --cxx-test master_failover-itest --gtest_filter MasterFailoverTest.TestLoadMoveCompletion Reviewers: nicolas, bogdan, rahuldesirazu Reviewed By: rahuldesirazu Subscribers: ybase Differential Revision: https://phabricator.dev.yugabyte.com/D11613
1 parent 2b8064d commit 7d60edf

File tree

4 files changed

+57
-9
lines changed

4 files changed

+57
-9
lines changed

src/yb/integration-tests/master_failover-itest.cc

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -496,18 +496,16 @@ TEST_F(MasterFailoverTest, TestLoadMoveCompletion) {
496496
MonoDelta::FromSeconds(60),
497497
"Load Balancer Idle check failed"));
498498

499-
// Delay TS Heartbeats by 40 times of original rate.
500-
ASSERT_OK(cluster_->SetFlagOnTServers("heartbeat_interval_ms",
501-
std::to_string(kHeartbeatIntervalMs * 40)));
502-
503-
// Wait for the delay to take effect.
504-
// Approximately let's give it 4 cycles.
505-
SleepFor(MonoDelta::FromMilliseconds(4 * kHeartbeatIntervalMs));
499+
// Disable TS heartbeats.
500+
LOG(INFO) << "Disabled Heartbeats";
501+
ASSERT_OK(cluster_->SetFlagOnTServers("tserver_disable_heartbeat_test_only",
502+
"true"));
506503

507504
// Blacklist a TS.
508505
ExternalMaster *leader = cluster_->GetLeaderMaster();
509506
ExternalTabletServer *ts = cluster_->tablet_server(3);
510507
ASSERT_OK(cluster_->AddTServerToBlacklist(leader, ts));
508+
LOG(INFO) << "Blacklisted tserver#3";
511509

512510
// Get the initial load.
513511
int idx = -1;
@@ -523,6 +521,7 @@ TEST_F(MasterFailoverTest, TestLoadMoveCompletion) {
523521
int initial_total_load = resp.total();
524522

525523
// Failover the leader.
524+
LOG(INFO) << "Failing over master leader.";
526525
ASSERT_OK(cluster_->StepDownMasterLeaderAndWaitForNewLeader());
527526

528527
// Get the final load and validate.
@@ -534,9 +533,35 @@ TEST_F(MasterFailoverTest, TestLoadMoveCompletion) {
534533

535534
proxy = cluster_->master_proxy(idx);
536535
ASSERT_OK(proxy->GetLoadMoveCompletion(req, &resp, &rpc));
536+
LOG(INFO) << "Initial loads. Before master leader failover: " << initial_total_load
537+
<< " v/s after master leader failover: " << resp.total();
537538

538539
EXPECT_EQ(resp.total(), initial_total_load) << "Expected the initial blacklisted load"
539540
" to be propagated to new leader master.";
541+
542+
// The progress should be reported as 0 until tservers heartbeat
543+
// their tablet reports.
544+
EXPECT_EQ(resp.percent(), 0) << "Expected the initial progress"
545+
" to be zero.";
546+
547+
// Now enable heartbeats.
548+
ASSERT_OK(cluster_->SetFlagOnTServers("tserver_disable_heartbeat_test_only",
549+
"false"));
550+
ASSERT_OK(cluster_->SetFlagOnMasters("blacklist_progress_initial_delay_secs",
551+
std::to_string((kHeartbeatIntervalMs * 20)/1000)));
552+
LOG(INFO) << "Enabled heartbeats";
553+
554+
ASSERT_OK(LoggedWaitFor(
555+
[&]() -> Result<bool> {
556+
req.Clear();
557+
resp.Clear();
558+
rpc.Reset();
559+
RETURN_NOT_OK(proxy->GetLoadMoveCompletion(req, &resp, &rpc));
560+
return resp.percent() >= 100;
561+
},
562+
MonoDelta::FromSeconds(300),
563+
"Waiting for blacklist load transfer to complete"
564+
));
540565
}
541566

542567
class MasterFailoverTestWithPlacement : public MasterFailoverTest {

src/yb/master/catalog_manager.cc

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,12 @@ TAG_FLAG(TEST_disable_setting_tablespace_id_at_creation, runtime);
374374
DEFINE_test_flag(bool, crash_server_on_sys_catalog_leader_affinity_move, false,
375375
"When set, crash the master process if it performs a sys catalog leader affinity "
376376
"move.");
377+
DEFINE_int32(blacklist_progress_initial_delay_secs, yb::master::kDelayAfterFailoverSecs,
378+
"When a master leader failsover, the time until which the progress of load movement "
379+
"off the blacklisted tservers is reported as 0. This initial delay "
380+
"gives sufficient time for heartbeats so that we don't report"
381+
" a premature incorrect completion.");
382+
TAG_FLAG(blacklist_progress_initial_delay_secs, runtime);
377383

378384
namespace yb {
379385
namespace master {
@@ -9098,18 +9104,34 @@ Status CatalogManager::GetLeaderBlacklistCompletionPercent(GetLoadMovePercentRes
90989104
}
90999105

91009106
Status CatalogManager::GetLoadMoveCompletionPercent(GetLoadMovePercentResponsePB* resp,
9101-
bool blacklist_leader) {
9107+
bool blacklist_leader) {
91029108
auto l = cluster_config_->LockForRead();
91039109

91049110
// Fine to pass in empty defaults if server_blacklist or leader_blacklist is not filled.
91059111
const BlacklistPB& state = blacklist_leader ? l->pb.leader_blacklist() : l->pb.server_blacklist();
91069112
int64_t blacklist_replicas = GetNumRelevantReplicas(state, blacklist_leader);
91079113
int64_t initial_load = (blacklist_leader) ?
91089114
state.initial_leader_load(): state.initial_replica_load();
9115+
// If we are starting up and don't find any load on the tservers, return progress as 0.
9116+
// We expect that by blacklist_progress_initial_delay_secs time, this should go away and if the
9117+
// load is reported as 0 on the blacklisted tservers after this time then it means that
9118+
// the transfer is successfully complete.
9119+
if (blacklist_replicas == 0 &&
9120+
TimeSinceElectedLeader() <= MonoDelta::FromSeconds(FLAGS_blacklist_progress_initial_delay_secs)) {
9121+
LOG(INFO) << "Master leadership has changed. Reporting progress as 0 until the catalog " <<
9122+
"manager gets the correct estimates of the remaining load on the blacklisted" <<
9123+
"tservers.";
9124+
resp->set_percent(0);
9125+
resp->set_total(initial_load);
9126+
resp->set_remaining(initial_load);
9127+
return Status::OK();
9128+
}
91099129

91109130
// On change of master leader, initial_load_ information may be lost temporarily. Reset to
91119131
// current value to avoid reporting progress percent as 100. Note that doing so will report
91129132
// progress percent as 0 instead.
9133+
// TODO(Sanket): This might be no longer relevant after we persist and load the initial load
9134+
// on failover. Need to investigate.
91139135
if (initial_load < blacklist_replicas) {
91149136
LOG(INFO) << Format("Initial load: $0, current load: $1."
91159137
" Initial load is less than the current load. Probably a master leader change."

src/yb/master/catalog_manager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ static const char* const kColocatedParentTableIdSuffix = ".colocated.parent.uuid
128128
static const char* const kColocatedParentTableNameSuffix = ".colocated.parent.tablename";
129129
static const char* const kTablegroupParentTableIdSuffix = ".tablegroup.parent.uuid";
130130
static const char* const kTablegroupParentTableNameSuffix = ".tablegroup.parent.tablename";
131+
static const int32 kDelayAfterFailoverSecs = 120;
131132

132133
using PlacementId = std::string;
133134

src/yb/master/catalog_manager_bg_tasks.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ DEFINE_int32(catalog_manager_bg_task_wait_ms, 1000,
5050
"between runs");
5151
TAG_FLAG(catalog_manager_bg_task_wait_ms, hidden);
5252

53-
DEFINE_int32(load_balancer_initial_delay_secs, 120,
53+
DEFINE_int32(load_balancer_initial_delay_secs, yb::master::kDelayAfterFailoverSecs,
5454
"Amount of time to wait between becoming master leader and enabling the load "
5555
"balancer.");
5656

0 commit comments

Comments
 (0)