@@ -396,6 +396,25 @@ class ControlScript(object):
396396 # exit no matter what
397397 os ._exit (os .EX_OK )
398398
399+ def start_first_master_tserver (self , master_addresses ):  # First-run bring-up of master, then tserver; returns an error message string on failure, None on success.
400+ self .processes .get ("master" ).start ()  # The master is started before anything else.
401+ if not self .setup_master ():  # A falsy result from setup_master() is treated as a master start failure.
402+ # TODO(sanketh): Make these helpers raise exceptions instead of mixing exceptions with return values.
403+ return "Failed to start master {}" .format (SCRIPT_NAME )
404+
405+ self .update_tserver_master_addrs ()  # Runs only after master setup succeeded and before the tserver process is launched.
406+ self .processes .get ("tserver" ).start ()
407+ if not self .wait_tserver ():  # A falsy result from wait_tserver() is treated as a tserver start failure.
408+ return "Failed to start tserver {}" .format (SCRIPT_NAME )
409+
410+ universe_uuid = YBAdminProxy .get_cluster_uuid (master_addresses )  # Looked up using the caller-supplied master_addresses.
411+ if universe_uuid and universe_uuid != self .configs .saved_data ["universe_uuid" ]:  # Persist only a non-empty UUID that differs from the saved one.
412+ self .configs .saved_data ["universe_uuid" ] = universe_uuid
413+ self .configs .save_configs ()
414+
415+ return None  # None tells the caller that startup succeeded.
416+
417+
399418 # Starts yb-master, yb-tserver, and yugaware processes.
400419 # After initializing, creates a callhome thread.
401420 def start_processes (self ):
@@ -529,38 +548,28 @@ class ControlScript(object):
529548 # Start or initialize yb-master and yb-tserver.
530549 if is_first_run :
531550 Output .init_animation ("Running system checks..." )
532- self .post_install_yb ()
533551
534- for name in ("master" , "tserver" ):
535- process = self .processes .get (name )
536- process .remove_error_logs ()
537- if not process .is_running ():
538- if not is_first_run :
539- Output .log (
540- "{} died unexpectedly. Restarting..." .format (process .name ),
541- logging .ERROR )
542- if name == "tserver" :
543- self .update_tserver_master_addrs ()
544- process .start ()
545- should_callhome = True
546-
547- if is_first_run :
548- if not self .wait_tserver ():
549- Output .update_animation ("Database failed to start" ,
550- status = Output .ANIMATION_FAIL )
551- Output .log_error_and_exit ("Failed to start tserver {}" .format (SCRIPT_NAME ))
552+ self .post_install_yb ()
552553
553- if not self .setup_master ():
554+ ret = self .start_first_master_tserver (master_addresses )
555+ if ret :
554556 Output .update_animation ("Database failed to start" ,
555557 status = Output .ANIMATION_FAIL )
556- Output .log_error_and_exit ("Failed to start master {}" .format (SCRIPT_NAME ))
557-
558- universe_uuid = YBAdminProxy .get_cluster_uuid (master_addresses )
559- if universe_uuid and universe_uuid != self .configs .saved_data ["universe_uuid" ]:
560- self .configs .saved_data ["universe_uuid" ] = universe_uuid
561- self .configs .save_configs ()
558+ Output .log_error_and_exit (ret )
562559
563560 Output .update_animation ("System checks" )
561+ else :
562+ for name in ("master" , "tserver" ):
563+ process = self .processes .get (name )
564+ process .remove_error_logs ()
565+ if not process .is_running ():
566+ Output .log (
567+ "{} died unexpectedly. Restarting..." .format (process .name ),
568+ logging .ERROR )
569+ if name == "tserver" :
570+ self .update_tserver_master_addrs ()
571+ process .start ()
572+ should_callhome = True
564573
565574 if self .configs .temp_data .get ("ui" ):
566575 (_ , was_started ) = self .maybe_start_yw (is_first_run , is_first_install )
@@ -807,9 +816,8 @@ class ControlScript(object):
807816 master_addr = "{}:{}" .format (master_ip ,
808817 self .configs .saved_data .get ("master_rpc_port" ))
809818
810- if (not self .processes .get ("master" ).is_running ()
811- or not self .processes .get ("tserver" ).is_running ()):
812- Output .log ("Failed waiting for yb-master/tserver... process died." , logging .ERROR )
819+ if (not self .processes .get ("master" ).is_running ()):
820+ Output .log ("Failed waiting for yb-master... process died." , logging .ERROR )
813821 raise RuntimeError ("process died unexpectedly." )
814822
815823 cur_master_uuids = [ m [0 ] for m in YBAdminProxy .get_masters (master_addr ) ]
@@ -930,6 +938,9 @@ class ControlScript(object):
930938 Output .log ("Failed to wait for tserver." )
931939 return False
932940
941+ # In a multi-node cluster, the tserver initially knows only about its own master and the
942+ # master it is joining. After the cluster is formed, this method attempts to refresh the
943+ # full list of masters so that the tserver becomes aware of the other masters.
933944 def update_tserver_master_addrs (self ):
934945 tserver_cmd = self .processes ["tserver" ].cmd
935946 master_flag = [ flag for flag in tserver_cmd if flag .find (TS_MASTER_ADDRS_FLAG ) >= 0 ]
0 commit comments