Skip to content

Commit 80bbb29

Browse files
authored
CleanUp Async Jobs after mgmt server maintenance (apache#8394)
This PR fixes moves resources stuck in transition state during async job cleanup Problem: During maintenance of the management server, other servers in the cluster or the same server after a restart initiate async job cleanup. However, this process leaves resources in a transitional state. The only recovery option currently available is to make direct database changes. Solution: This PR introduces a resolution by changing Volume, Virtual Machine, and Network resources from their transitional states. This adjustment enables the reattempt of failed operations without the need for manual database modifications.
1 parent 8d42ca8 commit 80bbb29

File tree

9 files changed

+274
-50
lines changed

9 files changed

+274
-50
lines changed

api/src/main/java/com/cloud/storage/Volume.java

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -39,37 +39,49 @@ enum Type {
3939
};
4040

4141
enum State {
42-
Allocated("The volume is allocated but has not been created yet."),
43-
Creating("The volume is being created. getPoolId() should reflect the pool where it is being created."),
44-
Ready("The volume is ready to be used."),
45-
Migrating("The volume is migrating to other storage pool"),
46-
Snapshotting("There is a snapshot created on this volume, not backed up to secondary storage yet"),
47-
RevertSnapshotting("There is a snapshot created on this volume, the volume is being reverting from snapshot"),
48-
Resizing("The volume is being resized"),
49-
Expunging("The volume is being expunging"),
50-
Expunged("The volume has been expunged, and can no longer be recovered"),
51-
Destroy("The volume is destroyed, and can be recovered."),
52-
Destroying("The volume is destroying, and can't be recovered."),
53-
UploadOp("The volume upload operation is in progress or in short the volume is on secondary storage"),
54-
Copying("Volume is copying from image store to primary, in case it's an uploaded volume"),
55-
Uploaded("Volume is uploaded"),
56-
NotUploaded("The volume entry is just created in DB, not yet uploaded"),
57-
UploadInProgress("Volume upload is in progress"),
58-
UploadError("Volume upload encountered some error"),
59-
UploadAbandoned("Volume upload is abandoned since the upload was never initiated within a specified time"),
60-
Attaching("The volume is attaching to a VM from Ready state."),
61-
Restoring("The volume is being restored from backup.");
42+
Allocated(false, "The volume is allocated but has not been created yet."),
43+
Creating(true, "The volume is being created. getPoolId() should reflect the pool where it is being created."),
44+
Ready(false, "The volume is ready to be used."),
45+
Migrating(true, "The volume is migrating to other storage pool"),
46+
Snapshotting(true, "There is a snapshot created on this volume, not backed up to secondary storage yet"),
47+
RevertSnapshotting(true, "There is a snapshot created on this volume, the volume is being reverting from snapshot"),
48+
Resizing(true, "The volume is being resized"),
49+
Expunging(true, "The volume is being expunging"),
50+
Expunged(false, "The volume has been expunged, and can no longer be recovered"),
51+
Destroy(false, "The volume is destroyed, and can be recovered."),
52+
Destroying(false, "The volume is destroying, and can't be recovered."),
53+
UploadOp(true, "The volume upload operation is in progress or in short the volume is on secondary storage"),
54+
Copying(true, "Volume is copying from image store to primary, in case it's an uploaded volume"),
55+
Uploaded(false, "Volume is uploaded"),
56+
NotUploaded(true, "The volume entry is just created in DB, not yet uploaded"),
57+
UploadInProgress(true, "Volume upload is in progress"),
58+
UploadError(false, "Volume upload encountered some error"),
59+
UploadAbandoned(false, "Volume upload is abandoned since the upload was never initiated within a specified time"),
60+
Attaching(true, "The volume is attaching to a VM from Ready state."),
61+
Restoring(true, "The volume is being restored from backup.");
62+
63+
boolean _transitional;
6264

6365
String _description;
6466

65-
private State(String description) {
67+
/**
68+
* Volume State
69+
* @param transitional true for transition/non-final state, otherwise false
70+
* @param description description of the state
71+
*/
72+
private State(boolean transitional, String description) {
73+
_transitional = transitional;
6674
_description = description;
6775
}
6876

6977
public static StateMachine2<State, Event, Volume> getStateMachine() {
7078
return s_fsm;
7179
}
7280

81+
public boolean isTransitional() {
82+
return _transitional;
83+
}
84+
7385
public String getDescription() {
7486
return _description;
7587
}

engine/api/src/main/java/org/apache/cloudstack/engine/orchestration/service/NetworkOrchestrationService.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import com.cloud.offering.NetworkOffering;
5151
import com.cloud.user.Account;
5252
import com.cloud.user.User;
53+
import com.cloud.utils.fsm.NoTransitionException;
5354
import com.cloud.utils.Pair;
5455
import com.cloud.vm.Nic;
5556
import com.cloud.vm.NicProfile;
@@ -268,6 +269,8 @@ void implementNetworkElementsAndResources(DeployDestination dest, ReservationCon
268269

269270
Map<String, String> finalizeServicesAndProvidersForNetwork(NetworkOffering offering, Long physicalNetworkId);
270271

272+
boolean stateTransitTo(Network network, Network.Event e) throws NoTransitionException;
273+
271274
List<Provider> getProvidersForServiceInNetwork(Network network, Service service);
272275

273276
StaticNatServiceProvider getStaticNatProviderForNetwork(Network network);

engine/api/src/main/java/org/apache/cloudstack/engine/subsystem/api/storage/VolumeService.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
*/
1919
package org.apache.cloudstack.engine.subsystem.api.storage;
2020

21-
import com.cloud.agent.api.Answer;
2221
import java.util.Map;
2322

2423
import org.apache.cloudstack.engine.cloud.entity.api.VolumeEntity;
2524
import org.apache.cloudstack.framework.async.AsyncCallFuture;
2625
import org.apache.cloudstack.storage.command.CommandResult;
2726

27+
import com.cloud.agent.api.Answer;
2828
import com.cloud.agent.api.to.VirtualMachineTO;
2929
import com.cloud.exception.StorageAccessException;
3030
import com.cloud.host.Host;
@@ -35,6 +35,9 @@
3535
import com.cloud.utils.Pair;
3636

3737
public interface VolumeService {
38+
39+
String SNAPSHOT_ID = "SNAPSHOT_ID";
40+
3841
class VolumeApiResult extends CommandResult {
3942
private final VolumeInfo volume;
4043

engine/orchestration/src/main/java/org/apache/cloudstack/engine/orchestration/NetworkOrchestrator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4424,7 +4424,8 @@ public Map<String, String> getSystemVMAccessDetails(final VirtualMachine vm) {
44244424
return accessDetails;
44254425
}
44264426

4427-
protected boolean stateTransitTo(final NetworkVO network, final Network.Event e) throws NoTransitionException {
4427+
@Override
4428+
public boolean stateTransitTo(final Network network, final Network.Event e) throws NoTransitionException {
44284429
return _stateMachine.transitTo(network, e, null, _networksDao);
44294430
}
44304431

engine/storage/volume/src/main/java/org/apache/cloudstack/storage/volume/VolumeObject.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -851,7 +851,7 @@ public ProvisioningType getProvisioningType(){
851851

852852
@Override
853853
public boolean delete() {
854-
return dataStore == null ? true : dataStore.delete(this);
854+
return dataStore == null || dataStore.delete(this);
855855
}
856856

857857
@Override

engine/storage/volume/src/main/java/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@
3232

3333
import javax.inject.Inject;
3434

35-
import org.apache.cloudstack.secret.dao.PassphraseDao;
36-
import com.cloud.storage.VMTemplateVO;
37-
import com.cloud.storage.dao.VMTemplateDao;
38-
import com.cloud.storage.resource.StorageProcessor;
3935
import org.apache.cloudstack.annotation.AnnotationService;
4036
import org.apache.cloudstack.annotation.dao.AnnotationDao;
4137
import org.apache.cloudstack.engine.cloud.entity.api.VolumeEntity;
@@ -66,6 +62,7 @@
6662
import org.apache.cloudstack.framework.async.AsyncCompletionCallback;
6763
import org.apache.cloudstack.framework.async.AsyncRpcContext;
6864
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
65+
import org.apache.cloudstack.secret.dao.PassphraseDao;
6966
import org.apache.cloudstack.storage.RemoteHostEndPoint;
7067
import org.apache.cloudstack.storage.command.CommandResult;
7168
import org.apache.cloudstack.storage.command.CopyCmdAnswer;
@@ -83,6 +80,7 @@
8380
import org.apache.cloudstack.storage.image.store.TemplateObject;
8481
import org.apache.cloudstack.storage.to.TemplateObjectTO;
8582
import org.apache.cloudstack.storage.to.VolumeObjectTO;
83+
import org.apache.commons.lang3.StringUtils;
8684
import org.apache.log4j.Logger;
8785
import org.springframework.stereotype.Component;
8886

@@ -122,13 +120,16 @@
122120
import com.cloud.storage.VMTemplateStoragePoolVO;
123121
import com.cloud.storage.VMTemplateStorageResourceAssoc;
124122
import com.cloud.storage.VMTemplateStorageResourceAssoc.Status;
123+
import com.cloud.storage.VMTemplateVO;
125124
import com.cloud.storage.Volume;
126125
import com.cloud.storage.Volume.State;
127126
import com.cloud.storage.VolumeDetailVO;
128127
import com.cloud.storage.VolumeVO;
128+
import com.cloud.storage.dao.VMTemplateDao;
129129
import com.cloud.storage.dao.VMTemplatePoolDao;
130130
import com.cloud.storage.dao.VolumeDao;
131131
import com.cloud.storage.dao.VolumeDetailsDao;
132+
import com.cloud.storage.resource.StorageProcessor;
132133
import com.cloud.storage.snapshot.SnapshotApiService;
133134
import com.cloud.storage.snapshot.SnapshotManager;
134135
import com.cloud.storage.template.TemplateConstants;
@@ -142,7 +143,6 @@
142143
import com.cloud.utils.db.GlobalLock;
143144
import com.cloud.utils.exception.CloudRuntimeException;
144145
import com.cloud.vm.VirtualMachine;
145-
import org.apache.commons.lang3.StringUtils;
146146

147147
@Component
148148
public class VolumeServiceImpl implements VolumeService {
@@ -206,8 +206,6 @@ public class VolumeServiceImpl implements VolumeService {
206206
@Inject
207207
private PassphraseDao passphraseDao;
208208

209-
private final static String SNAPSHOT_ID = "SNAPSHOT_ID";
210-
211209
public VolumeServiceImpl() {
212210
}
213211

framework/jobs/src/main/java/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java

Lines changed: 124 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,13 @@
3838
import org.apache.cloudstack.api.ApiCommandResourceType;
3939
import org.apache.cloudstack.api.ApiErrorCode;
4040
import org.apache.cloudstack.context.CallContext;
41+
import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService;
4142
import org.apache.cloudstack.engine.subsystem.api.storage.SnapshotDataFactory;
4243
import org.apache.cloudstack.engine.subsystem.api.storage.SnapshotInfo;
4344
import org.apache.cloudstack.engine.subsystem.api.storage.SnapshotService;
45+
import org.apache.cloudstack.engine.subsystem.api.storage.VolumeDataFactory;
46+
import org.apache.cloudstack.engine.subsystem.api.storage.VolumeInfo;
47+
import org.apache.cloudstack.engine.subsystem.api.storage.VolumeService;
4448
import org.apache.cloudstack.framework.config.ConfigKey;
4549
import org.apache.cloudstack.framework.config.Configurable;
4650
import org.apache.cloudstack.framework.jobs.AsyncJob;
@@ -65,7 +69,12 @@
6569
import org.apache.log4j.NDC;
6670

6771
import com.cloud.cluster.ClusterManagerListener;
72+
import com.cloud.network.Network;
73+
import com.cloud.network.dao.NetworkDao;
74+
import com.cloud.network.dao.NetworkVO;
6875
import com.cloud.storage.Snapshot;
76+
import com.cloud.storage.Volume;
77+
import com.cloud.storage.VolumeDetailVO;
6978
import com.cloud.storage.dao.SnapshotDao;
7079
import com.cloud.storage.dao.SnapshotDetailsDao;
7180
import com.cloud.storage.dao.SnapshotDetailsVO;
@@ -93,7 +102,11 @@
93102
import com.cloud.utils.db.TransactionStatus;
94103
import com.cloud.utils.exception.CloudRuntimeException;
95104
import com.cloud.utils.exception.ExceptionUtil;
105+
import com.cloud.utils.fsm.NoTransitionException;
96106
import com.cloud.utils.mgmt.JmxUtil;
107+
import com.cloud.vm.VMInstanceVO;
108+
import com.cloud.vm.VirtualMachine;
109+
import com.cloud.vm.VirtualMachineManager;
97110
import com.cloud.vm.dao.VMInstanceDao;
98111

99112
public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, ClusterManagerListener, Configurable {
@@ -148,6 +161,15 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager,
148161
@Inject
149162
private SnapshotDetailsDao _snapshotDetailsDao;
150163

164+
@Inject
165+
private VolumeDataFactory volFactory;
166+
@Inject
167+
private VirtualMachineManager virtualMachineManager;
168+
@Inject
169+
private NetworkDao networkDao;
170+
@Inject
171+
private NetworkOrchestrationService networkOrchestrationService;
172+
151173
private volatile long _executionRunNumber = 1;
152174

153175
private final ScheduledExecutorService _heartbeatScheduler = Executors.newScheduledThreadPool(1, new NamedThreadFactory("AsyncJobMgr-Heartbeat"));
@@ -1089,6 +1111,7 @@ public void doInTransactionWithoutResult(TransactionStatus status) {
10891111
if (s_logger.isDebugEnabled()) {
10901112
s_logger.debug("Cancel left-over job-" + job.getId());
10911113
}
1114+
cleanupResources(job);
10921115
job.setStatus(JobInfo.Status.FAILED);
10931116
job.setResultCode(ApiErrorCode.INTERNAL_ERROR.getHttpCode());
10941117
job.setResult("job cancelled because of management server restart or shutdown");
@@ -1101,33 +1124,115 @@ public void doInTransactionWithoutResult(TransactionStatus status) {
11011124
s_logger.debug("Purge queue item for cancelled job-" + job.getId());
11021125
}
11031126
_queueMgr.purgeAsyncJobQueueItemId(job.getId());
1104-
if (ApiCommandResourceType.Volume.toString().equals(job.getInstanceType())) {
1105-
1106-
try {
1107-
_volumeDetailsDao.removeDetail(job.getInstanceId(), "SNAPSHOT_ID");
1108-
_volsDao.remove(job.getInstanceId());
1109-
} catch (Exception e) {
1110-
s_logger.error("Unexpected exception while removing concurrent request meta data :" + e.getLocalizedMessage());
1111-
}
1112-
}
1113-
}
1114-
final List<SnapshotDetailsVO> snapshotList = _snapshotDetailsDao.findDetails(AsyncJob.Constants.MS_ID, Long.toString(msid), false);
1115-
for (final SnapshotDetailsVO snapshotDetailsVO : snapshotList) {
1116-
SnapshotInfo snapshot = snapshotFactory.getSnapshotOnPrimaryStore(snapshotDetailsVO.getResourceId());
1117-
if (snapshot == null) {
1118-
_snapshotDetailsDao.remove(snapshotDetailsVO.getId());
1119-
continue;
1120-
}
1121-
snapshotSrv.processEventOnSnapshotObject(snapshot, Snapshot.Event.OperationFailed);
1122-
_snapshotDetailsDao.removeDetail(snapshotDetailsVO.getResourceId(), AsyncJob.Constants.MS_ID);
11231127
}
1128+
cleanupFailedSnapshotsCreatedWithDefaultStrategy(msid);
11241129
}
11251130
});
11261131
} catch (Throwable e) {
11271132
s_logger.warn("Unexpected exception in cleaning up left over jobs for mamagement server node " + msid, e);
11281133
}
11291134
}
11301135

1136+
/*
1137+
Cleanup Resources in transition state and move them to appropriate state
1138+
This will allow other operation on the resource, instead of being stuck in transition state
1139+
*/
1140+
protected boolean cleanupResources(AsyncJobVO job) {
1141+
try {
1142+
ApiCommandResourceType resourceType = ApiCommandResourceType.fromString(job.getInstanceType());
1143+
if (resourceType == null) {
1144+
s_logger.warn("Unknown ResourceType. Skip Cleanup: " + job.getInstanceType());
1145+
return true;
1146+
}
1147+
switch (resourceType) {
1148+
case Volume:
1149+
return cleanupVolume(job.getInstanceId());
1150+
case VirtualMachine:
1151+
return cleanupVirtualMachine(job.getInstanceId());
1152+
case Network:
1153+
return cleanupNetwork(job.getInstanceId());
1154+
}
1155+
} catch (Exception e) {
1156+
s_logger.warn("Error while cleaning up resource: [" + job.getInstanceType().toString() + "] with Id: " + job.getInstanceId(), e);
1157+
return false;
1158+
}
1159+
return true;
1160+
}
1161+
1162+
private boolean cleanupVolume(final long volumeId) {
1163+
VolumeInfo vol = volFactory.getVolume(volumeId);
1164+
if (vol == null) {
1165+
s_logger.warn("Volume not found. Skip Cleanup. VolumeId: " + volumeId);
1166+
return true;
1167+
}
1168+
if (vol.getState().isTransitional()) {
1169+
s_logger.debug("Cleaning up volume with Id: " + volumeId);
1170+
boolean status = vol.stateTransit(Volume.Event.OperationFailed);
1171+
cleanupFailedVolumesCreatedFromSnapshots(volumeId);
1172+
return status;
1173+
}
1174+
s_logger.debug("Volume not in transition state. Skip cleanup. VolumeId: " + volumeId);
1175+
return true;
1176+
}
1177+
1178+
private boolean cleanupVirtualMachine(final long vmId) throws Exception {
1179+
VMInstanceVO vmInstanceVO = _vmInstanceDao.findById(vmId);
1180+
if (vmInstanceVO == null) {
1181+
s_logger.warn("Instance not found. Skip Cleanup. InstanceId: " + vmId);
1182+
return true;
1183+
}
1184+
if (vmInstanceVO.getState().isTransitional()) {
1185+
s_logger.debug("Cleaning up Instance with Id: " + vmId);
1186+
return virtualMachineManager.stateTransitTo(vmInstanceVO, VirtualMachine.Event.OperationFailed, vmInstanceVO.getHostId());
1187+
}
1188+
s_logger.debug("Instance not in transition state. Skip cleanup. InstanceId: " + vmId);
1189+
return true;
1190+
}
1191+
1192+
private boolean cleanupNetwork(final long networkId) throws Exception {
1193+
NetworkVO networkVO = networkDao.findById(networkId);
1194+
if (networkVO == null) {
1195+
s_logger.warn("Network not found. Skip Cleanup. NetworkId: " + networkId);
1196+
return true;
1197+
}
1198+
if (Network.State.Implementing.equals(networkVO.getState())) {
1199+
try {
1200+
s_logger.debug("Cleaning up Network with Id: " + networkId);
1201+
return networkOrchestrationService.stateTransitTo(networkVO, Network.Event.OperationFailed);
1202+
} catch (final NoTransitionException e) {
1203+
networkVO.setState(Network.State.Shutdown);
1204+
networkDao.update(networkVO.getId(), networkVO);
1205+
}
1206+
}
1207+
s_logger.debug("Network not in transition state. Skip cleanup. NetworkId: " + networkId);
1208+
return true;
1209+
}
1210+
1211+
private void cleanupFailedVolumesCreatedFromSnapshots(final long volumeId) {
1212+
try {
1213+
VolumeDetailVO volumeDetail = _volumeDetailsDao.findDetail(volumeId, VolumeService.SNAPSHOT_ID);
1214+
if (volumeDetail != null) {
1215+
_volumeDetailsDao.removeDetail(volumeId, VolumeService.SNAPSHOT_ID);
1216+
_volsDao.remove(volumeId);
1217+
}
1218+
} catch (Exception e) {
1219+
s_logger.error("Unexpected exception while removing concurrent request meta data :" + e.getLocalizedMessage());
1220+
}
1221+
}
1222+
1223+
private void cleanupFailedSnapshotsCreatedWithDefaultStrategy(final long msid) {
1224+
final List<SnapshotDetailsVO> snapshotList = _snapshotDetailsDao.findDetails(AsyncJob.Constants.MS_ID, Long.toString(msid), false);
1225+
for (final SnapshotDetailsVO snapshotDetailsVO : snapshotList) {
1226+
SnapshotInfo snapshot = snapshotFactory.getSnapshotOnPrimaryStore(snapshotDetailsVO.getResourceId());
1227+
if (snapshot == null) {
1228+
_snapshotDetailsDao.remove(snapshotDetailsVO.getId());
1229+
continue;
1230+
}
1231+
snapshotSrv.processEventOnSnapshotObject(snapshot, Snapshot.Event.OperationFailed);
1232+
_snapshotDetailsDao.removeDetail(snapshotDetailsVO.getResourceId(), AsyncJob.Constants.MS_ID);
1233+
}
1234+
}
1235+
11311236
@Override
11321237
public void onManagementNodeJoined(List<? extends ManagementServerHost> nodeList, long selfNodeId) {
11331238
}

0 commit comments

Comments
 (0)