From ef857253cb30503e95760f5e5e39b89f43544088 Mon Sep 17 00:00:00 2001
From: Piotr Konopka
Date: Tue, 26 Nov 2024 14:24:37 +0100
Subject: [PATCH] OCTRL-949 [core] Improve reaction to controlled nodes becoming unreachable

Includes:
- fixed copy-pasted log messages: "received executor failed" -> "received agent failed"
- added an operator log in case of connection issues with a Mesos slave
- allowed re-registering agent and executor IDs for a Task once they come back (they are removed when an Agent/Executor failure is received). Effectively, this allows an environment to be torn down correctly, fixing at least some of the leftover task issues (OCTRL-611)
- added documentation about configuring the node-down timeout
---
 core/environment/manager.go       |  4 ++--
 core/task/manager.go              |  7 +++++++
 core/task/scheduler.go            |  3 +++
 docs/handbook/appconfiguration.md | 21 ++++++++++++++++++++-
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/core/environment/manager.go b/core/environment/manager.go
index 1c9f6783d..ced832149 100644
--- a/core/environment/manager.go
+++ b/core/environment/manager.go
@@ -116,13 +116,13 @@ func NewEnvManager(tm *task.Manager, incomingEventCh chan event.Event) *Manager
 					WithField("partition", envId.String()).
 					WithField("agentId", typedEvent.GetId().Value).
 					WithError(err).
-					Error("cannot find environment for incoming executor failed event")
+					Error("cannot find environment for incoming agent failed event")
 			}
 			log.WithPrefix("scheduler").
 				WithField("partition", envId.String()).
 				WithField("agentId", typedEvent.GetId().Value).
 				WithField("envState", env.CurrentState()).
-				Debug("received executor failed event")
+				Debug("received agent failed event")
 		}
 
 	case *event.TasksReleasedEvent:
diff --git a/core/task/manager.go b/core/task/manager.go
index 640fe6484..83139c69a 100644
--- a/core/task/manager.go
+++ b/core/task/manager.go
@@ -1047,6 +1047,13 @@ func (m *Manager) updateTaskStatus(status *mesos.TaskStatus) {
 		if taskPtr.GetParent() != nil {
 			taskPtr.GetParent().UpdateStatus(ACTIVE)
 		}
+		if status.GetAgentID() != nil {
+			taskPtr.agentId = status.GetAgentID().GetValue()
+		}
+		if status.GetExecutorID() != nil {
+			taskPtr.executorId = status.GetExecutorID().GetValue()
+		}
+
 	case mesos.TASK_DROPPED, mesos.TASK_LOST, mesos.TASK_KILLED, mesos.TASK_FAILED,
 		mesos.TASK_ERROR, mesos.TASK_FINISHED:
 		taskPtr.status = INACTIVE
diff --git a/core/task/scheduler.go b/core/task/scheduler.go
index 53e548737..aeea651f5 100644
--- a/core/task/scheduler.go
+++ b/core/task/scheduler.go
@@ -246,6 +246,9 @@ func (state *schedulerState) failure(_ context.Context, e *scheduler.Event) erro
 			WithFields(fields).
 			WithField("level", infologger.IL_Support).
 			Error("agent failed")
+		log.WithField("level", infologger.IL_Ops).
+			WithField("detector", detector).
+			Errorf("possible connectivity issues with host '%s'", host)
 		state.taskman.internalEventCh <- event.NewAgentFailedEvent(aid)
 	}
 	return nil
diff --git a/docs/handbook/appconfiguration.md b/docs/handbook/appconfiguration.md
index 88250ac3f..5b5892771 100644
--- a/docs/handbook/appconfiguration.md
+++ b/docs/handbook/appconfiguration.md
@@ -1 +1,20 @@
-# Component Configuration
\ No newline at end of file
+# Component Configuration
+
+## Connectivity to controlled nodes
+
+ECS relies on Mesos to know the state of the controlled nodes.
+Thus, losing connection to a Mesos slave can be treated as a node being down or unresponsive.
+In case a Mesos slave is lost, tasks belonging to it are set to the ERROR state and treated as INACTIVE.
+Then, the environment is transitioned to ERROR.
+
+The Mesos slave health check can be configured with the `MESOS_MAX_AGENT_PING_TIMEOUTS` (`--max_agent_ping_timeouts`) and `MESOS_AGENT_PING_TIMEOUT` (`--agent_ping_timeout`) parameters for Mesos.
+Effectively, the product of the two parameters is the time after which a slave/agent is considered lost.
+Please refer to the Mesos documentation for more details.
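+
+For example, with the Mesos defaults (`agent_ping_timeout=15secs`, `max_agent_ping_timeouts=5`) a node is considered lost after 75 seconds. Below is a minimal sketch of a tighter setup, assuming the flags are passed to the Mesos master as `MESOS_`-prefixed environment variables; the values are purely illustrative:
+```shell
+# Illustrative values, set in the Mesos master's environment: the master
+# declares a node lost after 4 missed pings of 5 seconds each, i.e. 20 seconds.
+export MESOS_AGENT_PING_TIMEOUT=5secs
+export MESOS_MAX_AGENT_PING_TIMEOUTS=4
+```
\ No newline at end of file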