Skip to content

Commit e96c786

Browse files
committed
[docdb] yb-master fails to restart after errors during first run (yugabyte#5276)
Summary: Before we depended on the `is_first_run` flag (which is set upon creation of the instance file) in order to determine how we would initialize the system catalog metadata. Since this flag is not very trustworthy if there are errors between instance file creation and sys catalog init, this revision removes this dependency - Instead we now will do a load or create of the sys catalog metadata. Also removing temporary fix from yugabyte#4866. Test Plan: Test examples from the github issue: ``` # Creation of instance file succeeds but sys catalog creation fails yb-master --fs_data_dirs=/tmp/testcrash1 --rpc_bind_addresses=127.0.0.1:7100 --master_addresses=127.0.0.2:7100 --replication_factor=1 # Should succeed now: yb-master --fs_data_dirs=/tmp/testcrash1 --rpc_bind_addresses=127.0.0.1:7100 --master_addresses=127.0.0.1:7100 --replication_factor=1 ``` --- Also testing example from issue yugabyte#4866: ``` rm -rf /tmp/foo1 mkdir -p /tmp/foo1 python3 -m http.server 7100 --bind 127.0.0.1 & yb-master --fs_data_dirs=/tmp/foo1 --rpc_bind_addresses=127.0.0.1:7100 --master_addresses=127.0.0.1:7100 --replication_factor=1 kill %1 yb-master --fs_data_dirs=/tmp/foo1 --rpc_bind_addresses=127.0.0.1:7100 --master_addresses=127.0.0.1:7100 --replication_factor=1 # Still works ``` --- Also adding `ybd --cxx-test master-test --gtest_filter MasterTest.TestNetworkErrorOnFirstRun` Reviewers: sanketh, bogdan, rahuldesirazu Reviewed By: rahuldesirazu Subscribers: ybase Differential Revision: https://phabricator.dev.yugabyte.com/D9240
1 parent d916f3b commit e96c786

File tree

6 files changed

+39
-20
lines changed

6 files changed

+39
-20
lines changed

src/yb/master/catalog_manager.cc

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ CatalogManager::~CatalogManager() {
561561
Shutdown();
562562
}
563563

564-
Status CatalogManager::Init(bool is_first_run) {
564+
Status CatalogManager::Init() {
565565
{
566566
std::lock_guard<simple_spinlock> l(state_lock_);
567567
CHECK_EQ(kConstructed, state_);
@@ -575,7 +575,7 @@ Status CatalogManager::Init(bool is_first_run) {
575575
metric_num_tablet_servers_dead_ =
576576
METRIC_num_tablet_servers_dead.Instantiate(master_->metric_entity_cluster(), 0);
577577

578-
RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(is_first_run),
578+
RETURN_NOT_OK_PREPEND(InitSysCatalogAsync(),
579579
"Failed to initialize sys tables async");
580580

581581
if (PREDICT_FALSE(FLAGS_TEST_simulate_slow_system_tablet_bootstrap_secs > 0)) {
@@ -1289,9 +1289,16 @@ Status CatalogManager::CheckLocalHostInMasterAddresses() {
12891289
master_->opts().master_addresses_flag);
12901290
}
12911291

1292-
Status CatalogManager::InitSysCatalogAsync(bool is_first_run) {
1292+
Status CatalogManager::InitSysCatalogAsync() {
12931293
std::lock_guard<LockType> l(lock_);
1294-
if (is_first_run) {
1294+
1295+
// Optimistically try to load data from disk.
1296+
Status s = sys_catalog_->Load(master_->fs_manager());
1297+
1298+
if (!s.ok() && s.IsNotFound()) {
1299+
// We are on our first run, need to create the metadata file.
1300+
LOG(INFO) << "Did not find previous SysCatalogTable data on disk";
1301+
12951302
if (!master_->opts().AreMasterAddressesProvided()) {
12961303
master_->SetShellMode(true);
12971304
LOG(INFO) << "Starting master in shell mode.";
@@ -1300,10 +1307,11 @@ Status CatalogManager::InitSysCatalogAsync(bool is_first_run) {
13001307

13011308
RETURN_NOT_OK(CheckLocalHostInMasterAddresses());
13021309
RETURN_NOT_OK(sys_catalog_->CreateNew(master_->fs_manager()));
1303-
} else {
1304-
RETURN_NOT_OK(sys_catalog_->Load(master_->fs_manager()));
1310+
1311+
return Status::OK();
13051312
}
1306-
return Status::OK();
1313+
1314+
return s;
13071315
}
13081316

13091317
bool CatalogManager::IsInitialized() const {

src/yb/master/catalog_manager.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ class CatalogManager : public tserver::TabletPeerLookupIf {
170170
explicit CatalogManager(Master *master);
171171
virtual ~CatalogManager();
172172

173-
CHECKED_STATUS Init(bool is_first_run);
173+
CHECKED_STATUS Init();
174174

175175
void Shutdown();
176176
CHECKED_STATUS CheckOnline() const;
@@ -826,7 +826,7 @@ class CatalogManager : public tserver::TabletPeerLookupIf {
826826
// sys_catalog_.
827827
//
828828
// This method is thread-safe.
829-
CHECKED_STATUS InitSysCatalogAsync(bool is_first_run);
829+
CHECKED_STATUS InitSysCatalogAsync();
830830

831831
// Helper for creating the initial TableInfo state
832832
// Leaves the table "write locked" with the new info in the

src/yb/master/master-test.cc

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ DECLARE_bool(TEST_hang_on_namespace_transition);
6767
DECLARE_bool(TEST_simulate_crash_after_table_marked_deleting);
6868
DECLARE_int32(TEST_sys_catalog_write_rejection_percentage);
6969
DECLARE_bool(TEST_tablegroup_master_only);
70+
DECLARE_bool(TEST_simulate_port_conflict_error);
7071

7172
namespace yb {
7273
namespace master {
@@ -1804,6 +1805,18 @@ TEST_F(MasterTest, TestFailedMasterRestart) {
18041805
ASSERT_OK(mini_master_->Start());
18051806
}
18061807

1808+
TEST_F(MasterTest, TestNetworkErrorOnFirstRun) {
1809+
TearDown();
1810+
mini_master_.reset(new MiniMaster(Env::Default(), GetTestPath("Master-test"),
1811+
AllocateFreePort(), AllocateFreePort(), 0));
1812+
FLAGS_TEST_simulate_port_conflict_error = true;
1813+
ASSERT_NOK(mini_master_->Start());
1814+
// Instance file should be properly initialized, but consensus metadata is not initialized.
1815+
FLAGS_TEST_simulate_port_conflict_error = false;
1816+
// Restarting master should succeed.
1817+
ASSERT_OK(mini_master_->Start());
1818+
}
1819+
18071820
static void GetTableSchema(const char* table_name,
18081821
const char* namespace_name,
18091822
const Schema* kSchema,

src/yb/master/master.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ Status Master::InitCatalogManager() {
273273
if (catalog_manager_->IsInitialized()) {
274274
return STATUS(IllegalState, "Catalog manager is already initialized");
275275
}
276-
RETURN_NOT_OK_PREPEND(catalog_manager_->Init(is_first_run_),
276+
RETURN_NOT_OK_PREPEND(catalog_manager_->Init(),
277277
"Unable to initialize catalog manager");
278278
return Status::OK();
279279
}

src/yb/server/server_base.cc

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
#include "yb/util/monotime.h"
6868
#include "yb/util/net/sockaddr.h"
6969
#include "yb/util/net/net_util.h"
70+
#include "yb/util/status.h"
7071
#include "yb/util/user.h"
7172
#include "yb/util/pb_util.h"
7273
#include "yb/util/rolling_log.h"
@@ -98,6 +99,9 @@ DEFINE_bool(TEST_check_broadcast_address, true, "Break connectivity in test mini
9899

99100
DEFINE_test_flag(string, public_hostname_suffix, ".ip.yugabyte", "Suffix for public hostnames.");
100101

102+
DEFINE_test_flag(bool, simulate_port_conflict_error, false,
103+
"Simulate a port conflict error during initialization.");
104+
101105
using namespace std::literals;
102106
using namespace std::placeholders;
103107

@@ -157,7 +161,6 @@ RpcServerBase::RpcServerBase(string name, const ServerBaseOptions& options,
157161
metric_registry_(new MetricRegistry()),
158162
metric_entity_(METRIC_ENTITY_server.Instantiate(metric_registry_.get(),
159163
metric_namespace)),
160-
is_first_run_(false),
161164
options_(options),
162165
initialized_(false),
163166
stop_metrics_logging_latch_(1) {
@@ -461,21 +464,17 @@ Status RpcAndWebServerBase::Init() {
461464
if (s.IsNotFound() || (!s.ok() && fs_manager_->HasAnyLockFiles())) {
462465
LOG(INFO) << "Could not load existing FS layout: " << s.ToString();
463466
LOG(INFO) << "Creating new FS layout";
464-
is_first_run_ = true;
465467
RETURN_NOT_OK_PREPEND(fs_manager_->CreateInitialFileSystemLayout(true),
466468
"Could not create new FS layout");
467469
s = fs_manager_->Open();
468470
}
469471
RETURN_NOT_OK_PREPEND(s, "Failed to load FS layout");
470472

471-
s = RpcServerBase::Init();
472-
if (!s.ok() && is_first_run_) {
473-
// TODO (julien) : Remove this once #5276 is fixed.
474-
LOG(ERROR) << "Encountered an error, deleting FS files in order to reset run state: " << s;
475-
RETURN_NOT_OK_PREPEND(fs_manager_->DeleteFileSystemLayout(),
476-
"Failed deleting FS layout after RPCServerBase init failed.");
473+
if (PREDICT_FALSE(FLAGS_TEST_simulate_port_conflict_error)) {
474+
return STATUS(NetworkError, "Simulated port conflict error");
477475
}
478-
RETURN_NOT_OK_PREPEND(s, "Failed to initialize FS layout");
476+
477+
RETURN_NOT_OK(RpcServerBase::Init());
479478

480479
return Status::OK();
481480
}

src/yb/server/server_base.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ class RpcServerBase {
130130
gscoped_ptr<RpcServer> rpc_server_;
131131
std::unique_ptr<rpc::Messenger> messenger_;
132132
std::unique_ptr<rpc::ProxyCache> proxy_cache_;
133-
bool is_first_run_;
134133

135134
scoped_refptr<Clock> clock_;
136135

0 commit comments

Comments
 (0)