Skip to content

Commit 82d5a82

Browse files
committed
[PLAT-13287] Add a local provider retry test for StartMaster
Summary: Added a retry test in the local provider framework.
Test Plan: sbt test
Reviewers: nsingh
Reviewed By: nsingh
Subscribers: svc_phabricator, yugaware
Differential Revision: https://phorge.dev.yugabyte.com/D37979
1 parent 597dc3a commit 82d5a82

File tree

4 files changed

+80
-17
lines changed

4 files changed

+80
-17
lines changed

managed/src/main/java/com/yugabyte/yw/common/LocalNodeManager.java

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ public void shutdown() {
141141
killPostMasterProcess(baseDir);
142142
}
143143
log.debug("Destroying {}", process.pid());
144-
killProcess(process.pid());
144+
killProcess(process.pid(), false);
145145
} catch (Exception e) {
146146
log.error("Failed to destroy process " + process, e);
147147
}
@@ -202,16 +202,16 @@ private void killPostMasterProcess(String path) {
202202
}
203203
}
204204

205-
private void killProcess(long pid) throws IOException, InterruptedException {
205+
private void killProcess(long pid, boolean throwIfAbsent) {
206206
try {
207-
terminateProcessAndSubprocesses(pid);
207+
terminateProcessAndSubprocesses(pid, throwIfAbsent);
208208
} catch (SecurityException | IllegalArgumentException e) {
209209
System.err.println("Error occurred while terminating process: " + e.getMessage());
210210
e.printStackTrace();
211211
}
212212
}
213213

214-
private void terminateProcessAndSubprocesses(long pid) {
214+
private void terminateProcessAndSubprocesses(long pid, boolean throwIfAbsent) {
215215
ProcessHandle.of(pid)
216216
.ifPresentOrElse(
217217
process -> {
@@ -221,7 +221,9 @@ private void terminateProcessAndSubprocesses(long pid) {
221221
process.destroy();
222222
},
223223
() -> {
224-
throw new IllegalArgumentException("No such process with PID: " + pid);
224+
if (throwIfAbsent) {
225+
throw new IllegalArgumentException("No such process with PID: " + pid);
226+
}
225227
});
226228
}
227229

@@ -233,7 +235,7 @@ public void killProcess(String nodeName, UniverseTaskBase.ServerType serverType)
233235
Process process = nodeInfo.processMap.get(serverType);
234236
if (process != null) {
235237
log.debug("Destroying process with pid {} for {}", process.pid(), nodeInfo.ip);
236-
killProcess(process.pid());
238+
killProcess(process.pid(), true);
237239
}
238240
}
239241
}
@@ -702,6 +704,13 @@ public void startProcessForNode(
702704
UniverseDefinitionTaskParams.UserIntent userIntent,
703705
UniverseTaskBase.ServerType serverType,
704706
NodeInfo nodeInfo) {
707+
if (nodeInfo.processMap.get(serverType) != null) {
708+
Process process = nodeInfo.processMap.get(serverType);
709+
if (process.isAlive()) {
710+
log.debug("Already have started process");
711+
return;
712+
}
713+
}
705714
List<String> args = new ArrayList<>();
706715
String executable;
707716
LocalCloudInfo localCloudInfo = getCloudInfo(userIntent);
@@ -756,12 +765,7 @@ private void stopProcessForNode(
756765
throw new IllegalStateException("No process of type " + serverType + " for " + nodeInfo.name);
757766
}
758767
log.debug("Killing process {}", process.pid());
759-
try {
760-
killProcess(process.pid());
761-
} catch (IOException | InterruptedException e) {
762-
System.err.println("Error occurred while terminating process: " + e.getMessage());
763-
e.printStackTrace();
764-
}
768+
killProcess(process.pid(), true);
765769
}
766770

767771
private static List<String> readProcessIdsFromFile(String filePath) {

managed/src/test/java/com/yugabyte/yw/commissioner/tasks/local/GFlagsUpgradeLocalTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ public void testNodesAreSafeToTakeDownFails() throws InterruptedException, IOExc
551551
assertNull(universe.getUniverseDetails().placementModificationTaskUuid);
552552

553553
// Revert setting
554-
RuntimeConfigEntry.upsertGlobal(UniverseConfKeys.followerLagMaxThreshold.getKey(), "30s");
554+
RuntimeConfigEntry.upsertGlobal(UniverseConfKeys.followerLagMaxThreshold.getKey(), "60s");
555555

556556
// Now it should be successful
557557
doGflagsUpgrade(

managed/src/test/java/com/yugabyte/yw/commissioner/tasks/local/LocalProviderUniverseTestBase.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@
2626
import com.yugabyte.yw.commissioner.tasks.subtasks.CheckClusterConsistency;
2727
import com.yugabyte.yw.common.ApiUtils;
2828
import com.yugabyte.yw.common.ConfigHelper;
29+
import com.yugabyte.yw.common.CustomerTaskManager;
2930
import com.yugabyte.yw.common.LocalNodeManager;
3031
import com.yugabyte.yw.common.LocalNodeUniverseManager;
3132
import com.yugabyte.yw.common.ModelFactory;
3233
import com.yugabyte.yw.common.NodeUIApiHelper;
3334
import com.yugabyte.yw.common.PlacementInfoUtil;
34-
import com.yugabyte.yw.common.PlatformGuiceApplicationBaseTest;
3535
import com.yugabyte.yw.common.ReleaseManager;
3636
import com.yugabyte.yw.common.RetryTaskUntilCondition;
3737
import com.yugabyte.yw.common.ShellResponse;
@@ -126,7 +126,7 @@
126126
import play.libs.Json;
127127

128128
@Slf4j
129-
public abstract class LocalProviderUniverseTestBase extends PlatformGuiceApplicationBaseTest {
129+
public abstract class LocalProviderUniverseTestBase extends CommissionerBaseTest {
130130
private static final boolean IS_LINUX = System.getProperty("os.name").equalsIgnoreCase("linux");
131131
private static final Set<String> CONTROL_FILES =
132132
Set.of(LocalNodeManager.MASTER_EXECUTABLE, LocalNodeManager.TSERVER_EXECUTABLE);
@@ -218,7 +218,6 @@ public Map<String, String> getYbcGFlags(UniverseDefinitionTaskParams.UserIntent
218218
protected YcqlQueryExecutor ycqlQueryExecutor;
219219
protected UniverseTableHandler tableHandler;
220220
protected CertificateHelper certificateHelper;
221-
protected Commissioner commissioner;
222221
protected SettableRuntimeConfigFactory settableRuntimeConfigFactory;
223222
protected RuntimeConfService runtimeConfService;
224223
protected JobScheduler jobScheduler;
@@ -427,13 +426,21 @@ private void injectDependencies() {
427426
runtimeConfService = app.injector().instanceOf(RuntimeConfService.class);
428427
jobScheduler = app.injector().instanceOf(JobScheduler.class);
429428
autoMasterFailoverScheduler = app.injector().instanceOf(AutoMasterFailoverScheduler.class);
429+
customerTaskManager = app.injector().instanceOf(CustomerTaskManager.class);
430430
}
431431

432432
@Before
433433
public void setUp() {
434434
injectDependencies();
435435

436436
settableRuntimeConfigFactory.globalRuntimeConf().setValue("yb.releases.use_redesign", "false");
437+
settableRuntimeConfigFactory
438+
.globalRuntimeConf()
439+
.setValue(GlobalConfKeys.startMasterOnRemoveNode.getKey(), "true");
440+
settableRuntimeConfigFactory
441+
.globalRuntimeConf()
442+
.setValue(GlobalConfKeys.startMasterOnStopNode.getKey(), "true");
443+
437444
settableRuntimeConfigFactory
438445
.globalRuntimeConf()
439446
.setValue("yb.universe.consistency_check.enabled", "true");

managed/src/test/java/com/yugabyte/yw/commissioner/tasks/local/NodeOperationsLocalTest.java

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,21 @@
1010
import com.fasterxml.jackson.databind.JsonNode;
1111
import com.yugabyte.yw.commissioner.tasks.CommissionerBaseTest;
1212
import com.yugabyte.yw.commissioner.tasks.UniverseTaskBase;
13+
import com.yugabyte.yw.commissioner.tasks.params.NodeTaskParams;
1314
import com.yugabyte.yw.common.FakeApiHelper;
1415
import com.yugabyte.yw.common.LocalNodeManager;
1516
import com.yugabyte.yw.common.NodeActionType;
17+
import com.yugabyte.yw.common.PlacementInfoUtil;
1618
import com.yugabyte.yw.common.gflags.SpecificGFlags;
1719
import com.yugabyte.yw.common.utils.Pair;
20+
import com.yugabyte.yw.controllers.UniverseControllerRequestBinder;
1821
import com.yugabyte.yw.forms.NodeActionFormData;
1922
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams;
23+
import com.yugabyte.yw.models.CustomerTask;
2024
import com.yugabyte.yw.models.TaskInfo;
2125
import com.yugabyte.yw.models.Universe;
2226
import com.yugabyte.yw.models.helpers.NodeDetails;
27+
import com.yugabyte.yw.models.helpers.TaskType;
2328
import java.util.List;
2429
import java.util.Map;
2530
import java.util.UUID;
@@ -110,6 +115,7 @@ public void testMasterNodeRemoval() throws InterruptedException {
110115
String nodeName = nodeDetails.nodeName;
111116
NodeActionFormData formData = new NodeActionFormData();
112117
formData.nodeAction = NodeActionType.STOP;
118+
NodeActionFormData.startMasterOnStopNode = true;
113119
Result result = nodeOperationInUniverse(universe.getUniverseUUID(), nodeName, formData);
114120
checkAndWaitForTask(result);
115121
universe = Universe.getOrBadRequest(universe.getUniverseUUID());
@@ -175,7 +181,53 @@ public void testRemoveReleaseNodeFromUniverse() throws InterruptedException {
175181
verifyUniverseState(universe);
176182
}
177183

178-
// @Test
184+
@Test
185+
public void testStartMasterRetries() throws InterruptedException {
186+
Universe universe = createUniverse(4, 3);
187+
Map<UUID, Integer> zoneToCount =
188+
PlacementInfoUtil.getAzUuidToNumNodes(
189+
universe.getUniverseDetails().getPrimaryCluster().placementInfo);
190+
191+
NodeDetails nodeWithMaster =
192+
universe.getUniverseDetails().nodeDetailsSet.stream()
193+
.filter(n -> n.isMaster && zoneToCount.get(n.azUuid) == 2)
194+
.findFirst()
195+
.get();
196+
NodeActionFormData.startMasterOnStopNode = false;
197+
NodeTaskParams taskParams =
198+
UniverseControllerRequestBinder.deepCopy(
199+
universe.getUniverseDetails(), NodeTaskParams.class);
200+
NodeActionType nodeActionType = NodeActionType.STOP;
201+
taskParams.nodeName = nodeWithMaster.getNodeName();
202+
UUID taskUUID = commissioner.submit(nodeActionType.getCommissionerTask(), taskParams);
203+
TaskInfo taskInfo = CommissionerBaseTest.waitForTask(taskUUID, 500);
204+
assertEquals(TaskInfo.State.Success, taskInfo.getTaskState());
205+
universe = Universe.getOrBadRequest(universe.getUniverseUUID());
206+
assertEquals(2, universe.getMasters().size());
207+
NodeDetails nodeWithoutMaster =
208+
universe.getUniverseDetails().nodeDetailsSet.stream()
209+
.filter(n -> !n.isMaster && !n.getNodeName().equals(nodeWithMaster.getNodeName()))
210+
.findFirst()
211+
.get();
212+
213+
taskParams =
214+
UniverseControllerRequestBinder.deepCopy(
215+
universe.getUniverseDetails(), NodeTaskParams.class);
216+
taskParams.nodeName = nodeWithoutMaster.getNodeName();
217+
218+
super.verifyTaskRetries(
219+
customer,
220+
CustomerTask.TaskType.StartMaster,
221+
CustomerTask.TargetType.Universe,
222+
universe.getUniverseUUID(),
223+
TaskType.StartMasterOnNode,
224+
taskParams,
225+
false);
226+
universe = Universe.getOrBadRequest(universe.getUniverseUUID());
227+
verifyUniverseState(universe);
228+
}
229+
230+
@Test
179231
public void testStartAlreadyStarted() throws InterruptedException {
180232
Universe universe = createUniverse(3, 1);
181233
NodeDetails nodeDetails =

0 commit comments

Comments
 (0)