From ef857253cb30503e95760f5e5e39b89f43544088 Mon Sep 17 00:00:00 2001
From: Piotr Konopka
Date: Tue, 26 Nov 2024 14:24:37 +0100
Subject: [PATCH] OCTRL-949 [core] Improve reaction to controlled nodes becoming unreachable

Includes:
- fixed copy-pasted log messages: "received executor failed" -> "received agent failed"
- added an operator log in case of connection issues with a Mesos slave
- allowed re-registering agent and executor IDs for a Task once they come back (they are removed when an Agent/Executor failure is received). Effectively, this allows an environment to be torn down correctly, fixing at least some of the leftover task issues (OCTRL-611)
- added documentation about configuring the node-down timeout
---
 core/environment/manager.go       |  4 ++--
 core/task/manager.go              |  7 +++++++
 core/task/scheduler.go            |  3 +++
 docs/handbook/appconfiguration.md | 21 ++++++++++++++++++++-
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/core/environment/manager.go b/core/environment/manager.go
index 1c9f6783d..ced832149 100644
--- a/core/environment/manager.go
+++ b/core/environment/manager.go
@@ -116,13 +116,13 @@ func NewEnvManager(tm *task.Manager, incomingEventCh chan event.Event) *Manager
 					WithField("partition", envId.String()).
 					WithField("agentId", typedEvent.GetId().Value).
 					WithError(err).
-					Error("cannot find environment for incoming executor failed event")
+					Error("cannot find environment for incoming agent failed event")
 			}
 			log.WithPrefix("scheduler").
 				WithField("partition", envId.String()).
 				WithField("agentId", typedEvent.GetId().Value).
 				WithField("envState", env.CurrentState()).
-				Debug("received executor failed event")
+				Debug("received agent failed event")
 		}
 
 	case *event.TasksReleasedEvent:
diff --git a/core/task/manager.go b/core/task/manager.go
index 640fe6484..83139c69a 100644
--- a/core/task/manager.go
+++ b/core/task/manager.go
@@ -1047,6 +1047,13 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {
 		if taskPtr.GetParent() != nil {
 			taskPtr.GetParent().UpdateStatus(ACTIVE)
 		}
+		if status.GetAgentID() != nil {
+			taskPtr.agentId = status.GetAgentID().GetValue()
+		}
+		if status.GetExecutorID() != nil {
+			taskPtr.executorId = status.GetExecutorID().GetValue()
+		}
+
 	case mesos.TASK_DROPPED, mesos.TASK_LOST, mesos.TASK_KILLED, mesos.TASK_FAILED,
 		mesos.TASK_ERROR, mesos.TASK_FINISHED:
 		taskPtr.status = INACTIVE
diff --git a/core/task/scheduler.go b/core/task/scheduler.go
index 53e548737..aeea651f5 100644
--- a/core/task/scheduler.go
+++ b/core/task/scheduler.go
@@ -246,6 +246,9 @@ func (state *schedulerState) failure(_ context.Context, e *scheduler.Event) erro
 			WithFields(fields).
 			WithField("level", infologger.IL_Support).
 			Error("agent failed")
+		log.WithField("level", infologger.IL_Ops).
+			WithField("detector", detector).
+			Errorf("possible connectivity issues with host '%s'", host)
 		state.taskman.internalEventCh <- event.NewAgentFailedEvent(aid)
 	}
 	return nil
diff --git a/docs/handbook/appconfiguration.md b/docs/handbook/appconfiguration.md
index 88250ac3f..5b5892771 100644
--- a/docs/handbook/appconfiguration.md
+++ b/docs/handbook/appconfiguration.md
@@ -1 +1,20 @@
-# Component Configuration
\ No newline at end of file
+# Component Configuration
+
+## Connectivity to controlled nodes
+
+ECS relies on Mesos to know the state of the controlled nodes.
+Thus, losing connection to a Mesos slave can be treated as a node being down or unresponsive.
+In case a Mesos slave is lost, tasks belonging to it are set to the ERROR state and treated as INACTIVE.
+Then, the environment is transitioned to ERROR.
+
+The Mesos slave health check can be configured with the `MESOS_MAX_AGENT_PING_TIMEOUTS` (`--max_agent_ping_timeouts`) and `MESOS_AGENT_PING_TIMEOUT` (`--agent_ping_timeout`) parameters for Mesos.
+Effectively, the product of the two parameters is the time after which a slave/agent is considered lost.
+Please refer to the Mesos documentation for more details.
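+
+For example, with the Mesos defaults (`agent_ping_timeout=15secs`, `max_agent_ping_timeouts=5`) a node is considered lost after 75 seconds. Below is a minimal sketch of a tighter setup, assuming the flags are passed to the Mesos master as `MESOS_`-prefixed environment variables; the values are purely illustrative:
+```shell
+# Illustrative values, set in the Mesos master's environment: the master
+# declares a node lost after 4 missed pings of 5 seconds each, i.e. 20 seconds.
+export MESOS_AGENT_PING_TIMEOUT=5secs
+export MESOS_MAX_AGENT_PING_TIMEOUTS=4
+```
\ No newline at end of file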