Skip to content

Commit d916f3b

Browse files
committed
[yugabyte#5304] [yugabyted] Show a better error message when master fails to start
Summary: After update_tserver_master_addrs() was introduced in yugabyte@fa5862a, that method became the first step to fail when a master fails to come up. This makes the error message hard to understand and also makes the user wait for 60s or so. Instead, we restore the old logic: start the master, wait until it is up, and only then start the tserver.
Test Plan: Run yugabyted in the error scenario of the bug (passing a gibberish port to rpc_bind_addresses simulates the original bug). Run yugabyted in the regular single-node scenario.
Reviewers: wesley
Reviewed By: wesley
Subscribers: yugaware
Differential Revision: https://phabricator.dev.yugabyte.com/D9123
1 parent f935dc8 commit d916f3b

File tree

1 file changed

+40
-29
lines changed

1 file changed

+40
-29
lines changed

bin/yugabyted

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,25 @@ class ControlScript(object):
396396
# exit no matter what
397397
os._exit(os.EX_OK)
398398

399+
def start_first_master_tserver(self, master_addresses):
    """Bring up the master first, then the tserver, on a first run.

    The master process is started and setup_master() is checked before the
    tserver is touched, so a master failure is reported as a master failure
    rather than surfacing later from a tserver-related step.

    Args:
        master_addresses: Passed to YBAdminProxy.get_cluster_uuid() to look up
            the universe UUID once both processes are up.

    Returns:
        None on success, otherwise an error message string describing which
        process failed to start.
    """
    master_proc = self.processes.get("master")
    master_proc.start()
    master_ready = self.setup_master()
    if not master_ready:
        # TODO(sanketh): Make these all throw exceptions instead of mixing
        # exceptions and return values.
        return "Failed to start master {}".format(SCRIPT_NAME)

    # The master is up; refresh the tserver's master address list before
    # launching the tserver.
    self.update_tserver_master_addrs()
    tserver_proc = self.processes.get("tserver")
    tserver_proc.start()
    tserver_ready = self.wait_tserver()
    if not tserver_ready:
        return "Failed to start tserver {}".format(SCRIPT_NAME)

    # Persist the universe UUID only when one was returned and it differs
    # from the value already saved.
    cluster_uuid = YBAdminProxy.get_cluster_uuid(master_addresses)
    if cluster_uuid:
        if cluster_uuid != self.configs.saved_data["universe_uuid"]:
            self.configs.saved_data["universe_uuid"] = cluster_uuid
            self.configs.save_configs()

    return None
416+
417+
399418
# Starts yb-master, yb-tserver, and yugaware processes.
400419
# After initializing, creates a callhome thread.
401420
def start_processes(self):
@@ -529,38 +548,28 @@ class ControlScript(object):
529548
# Start or initialize yb-master and yb-tserver.
530549
if is_first_run:
531550
Output.init_animation("Running system checks...")
532-
self.post_install_yb()
533551

534-
for name in ("master", "tserver"):
535-
process = self.processes.get(name)
536-
process.remove_error_logs()
537-
if not process.is_running():
538-
if not is_first_run:
539-
Output.log(
540-
"{} died unexpectedly. Restarting...".format(process.name),
541-
logging.ERROR)
542-
if name == "tserver":
543-
self.update_tserver_master_addrs()
544-
process.start()
545-
should_callhome = True
546-
547-
if is_first_run:
548-
if not self.wait_tserver():
549-
Output.update_animation("Database failed to start",
550-
status=Output.ANIMATION_FAIL)
551-
Output.log_error_and_exit("Failed to start tserver {}".format(SCRIPT_NAME))
552+
self.post_install_yb()
552553

553-
if not self.setup_master():
554+
ret = self.start_first_master_tserver(master_addresses)
555+
if ret:
554556
Output.update_animation("Database failed to start",
555557
status=Output.ANIMATION_FAIL)
556-
Output.log_error_and_exit("Failed to start master {}".format(SCRIPT_NAME))
557-
558-
universe_uuid = YBAdminProxy.get_cluster_uuid(master_addresses)
559-
if universe_uuid and universe_uuid != self.configs.saved_data["universe_uuid"]:
560-
self.configs.saved_data["universe_uuid"] = universe_uuid
561-
self.configs.save_configs()
558+
Output.log_error_and_exit(ret)
562559

563560
Output.update_animation("System checks")
561+
else:
562+
for name in ("master", "tserver"):
563+
process = self.processes.get(name)
564+
process.remove_error_logs()
565+
if not process.is_running():
566+
Output.log(
567+
"{} died unexpectedly. Restarting...".format(process.name),
568+
logging.ERROR)
569+
if name == "tserver":
570+
self.update_tserver_master_addrs()
571+
process.start()
572+
should_callhome = True
564573

565574
if self.configs.temp_data.get("ui"):
566575
(_, was_started) = self.maybe_start_yw(is_first_run, is_first_install)
@@ -807,9 +816,8 @@ class ControlScript(object):
807816
master_addr = "{}:{}".format(master_ip,
808817
self.configs.saved_data.get("master_rpc_port"))
809818

810-
if (not self.processes.get("master").is_running()
811-
or not self.processes.get("tserver").is_running()):
812-
Output.log("Failed waiting for yb-master/tserver... process died.", logging.ERROR)
819+
if (not self.processes.get("master").is_running()):
820+
Output.log("Failed waiting for yb-master... process died.", logging.ERROR)
813821
raise RuntimeError("process died unexpectedly.")
814822

815823
cur_master_uuids = [ m[0] for m in YBAdminProxy.get_masters(master_addr) ]
@@ -930,6 +938,9 @@ class ControlScript(object):
930938
Output.log("Failed to wait for tserver.")
931939
return False
932940

941+
# In a multi-node cluster, the tserver initially knows just about its own master and the
942+
# master it is joining. After the cluster is formed, this method will attempt to
943+
# refresh the full list of masters so that the tserver can become aware of other masters.
933944
def update_tserver_master_addrs(self):
934945
tserver_cmd = self.processes["tserver"].cmd
935946
master_flag = [ flag for flag in tserver_cmd if flag.find(TS_MASTER_ADDRS_FLAG) >= 0 ]

0 commit comments

Comments
 (0)