Skip to content

Commit a8d7d72

Browse files
committed
sandbox agent: minor fix to graceful shutdown
1 parent c32c6bd commit a8d7d72

File tree

3 files changed

+25
-23
lines changed

3 files changed

+25
-23
lines changed

SandboxAgent/deployment.py

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -113,43 +113,47 @@ def get_all_children_pids(self):
113113
return children_pids
114114

115115
def check_child_process(self):
116+
log_filepath = None
116117
pid, status = os.waitpid(-1, os.WNOHANG|os.WUNTRACED|os.WCONTINUED)
117-
failed_process_name = ""
118+
if pid == 0:
119+
return True, pid, "Unknown process", log_filepath
120+
121+
stopped_process_name = ""
118122
if os.WIFCONTINUED(status) or os.WIFSTOPPED(status):
119123
return False, _
120124
if os.WIFSIGNALED(status) or os.WIFEXITED(status):
121125
self._logger.error("Process with pid: " + str(pid) + " stopped.")
122126
if pid == self._fluentbit_actual_pid:
123-
failed_process_name = "Fluent-bit"
127+
stopped_process_name = "Fluent-bit"
124128
log_filepath = "/opt/mfn/LoggingService/fluent-bit/fluent-bit.log"
125129
elif pid == self._queue_service_process.pid:
126-
failed_process_name = "Queue service"
130+
stopped_process_name = "Queue service"
127131
log_filepath = None
128132
elif pid == self._frontend_process.pid:
129-
failed_process_name = "Frontend"
133+
stopped_process_name = "Frontend"
130134
log_filepath = "/opt/mfn/logs/frontend.log"
131135
else:
132136
for jrhp in self._javarequesthandler_process_list:
133137
if pid == jrhp.pid:
134-
failed_process_name = "Java request handler"
138+
stopped_process_name = "Java request handler"
135139
log_filepath = "/opt/mfn/logs/javaworker.log"
136140
break
137141
for state_name in self._functionworker_process_map:
138142
process = self._functionworker_process_map[state_name]
139143
if pid == process.pid:
140-
failed_process_name = "Function worker (" + state_name + ")"
144+
stopped_process_name = "Function worker (" + state_name + ")"
141145
log_filepath = "/opt/mfn/logs/function_" + state_name + ".log"
142146
del self._functionworker_process_map[state_name]
143147
break
144148

145-
self._logger.error("Failed process name: " + failed_process_name)
149+
self._logger.error("Stopped process name: " + stopped_process_name)
146150

147151
if os.path.exists('/var/run/secrets/kubernetes.io'):
148-
return True, pid, failed_process_name, log_filepath
152+
return True, pid, stopped_process_name, log_filepath
149153
else:
150154
# TODO: try to relaunch some of the processes (FWs, fluentbit, frontend)
151155
self._logger.info(self._child_process_command_args_map[pid])
152-
return True, pid, failed_process_name, log_filepath
156+
return True, pid, stopped_process_name, log_filepath
153157

154158
def shutdown(self):
155159
shutdown_message = {}

SandboxAgent/process_utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ def run_command(command, logger, custom_env=None, wait_output=False, process_log
7373

7474
def run_command_return_output(cmd, logger):
7575
error = None
76-
output = None
76+
output = ""
7777
try:
7878
args = shlex.split(cmd)
7979
child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
@@ -82,7 +82,7 @@ def run_command_return_output(cmd, logger):
8282
if child.returncode != 0:
8383
error = child_stderr_bytes.decode().strip()
8484
except Exception as exc:
85-
logger.error('[SandboxAgent] Could not execute command and return output: %s', str(exc))
85+
logger.error('[SandboxAgent] Could not execute command and return output: %s, %s', cmd, str(exc))
8686
error = exc
8787

8888
return output.strip(), error

SandboxAgent/sandboxagent.py

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -200,8 +200,6 @@ def _process_deployment_info(self):
200200
else:
201201
self._management_endpoints = json.loads(self._management_endpoints)
202202

203-
204-
205203
if not has_error:
206204
self._logger.info("External endpoint: %s", self._external_endpoint)
207205
self._logger.info("Internal endpoint: %s", self._internal_endpoint)
@@ -224,30 +222,30 @@ def sigterm(self, signum, _):
224222
raise KeyboardInterrupt
225223

226224
def sigchld(self, signum, _):
227-
should_shutdown, pid, failed_process_name, log_filepath = self._deployment.check_child_process()
225+
if self._shutting_down:
226+
return
227+
228+
should_shutdown, pid, stopped_process_name, log_filepath = self._deployment.check_child_process()
228229

229230
if should_shutdown:
230-
self._update_deployment_status(True, "A sandbox process stopped unexpectedly: " + failed_process_name, log_filepath)
231+
self._update_deployment_status(True, "A sandbox process stopped unexpectedly: " + stopped_process_name, log_filepath)
231232

232233
if pid == self._queue_service_process.pid:
233234
self._queue_service_process = None
234235
elif pid == self._frontend_process.pid:
235236
self._frontend_process = None
236-
elif failed_process_name == "Fluent-bit":
237+
elif stopped_process_name == "Fluent-bit":
237238
self._fluentbit_process = None
238239

239-
self.shutdown(reason="Process " + failed_process_name + " with pid: " + str(pid) + " stopped unexpectedly.")
240+
self.shutdown(reason="Process " + stopped_process_name + " with pid: " + str(pid) + " stopped unexpectedly.")
240241

241242
def shutdown(self, reason=None):
242-
if self._shutting_down:
243-
return
244-
245243
self._shutting_down = True
246244
errmsg = ""
247245
if reason is not None:
248246
errmsg = "Shutting down sandboxagent due to reason: " + reason + "..."
249247
self._logger.info(errmsg)
250-
# some process dies unexpectedly; need to stop as immediately as possible
248+
# some process died unexpectedly; need to stop as immediately as possible
251249
if self._fluentbit_process is not None:
252250
time.sleep(2) # flush interval of fluent-bit
253251
os._exit(1)
@@ -269,8 +267,8 @@ def shutdown(self, reason=None):
269267
self._local_queue_client.removeTopic(self._instructions_topic)
270268
self._local_queue_client.shutdown()
271269

272-
self._logger.info("Shutting down the queue service...")
273-
process_utils.terminate_and_wait_child(self._queue_service_process, "queue service", 5, self._logger)
270+
#self._logger.info("Shutting down the queue service...")
271+
#process_utils.terminate_and_wait_child(self._queue_service_process, "queue service", 5, self._logger)
274272
else:
275273
self._logger.info("No queue service; most probably it was the reason of the shutdown.")
276274
self._logger.info("Force shutting down the function worker(s)...")

0 commit comments

Comments
 (0)