From 363820efcc89d7551b79f1a764151523f3357a88 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Wed, 29 Oct 2025 09:00:21 +0100 Subject: [PATCH] Rework handling failed GO_ERROR In this commit we unify the way that a failed GO_ERROR is handled. We recognize invalid event errors as harmless (ERROR->ERROR is pointless, DONE->ERROR is too late). Any other case is very much unexpected and we print a visible error and comply with the previous behaviour - setting EROR state manually. Closes OCTRL-1064. --- core/environment/environment.go | 18 ++---------- core/environment/manager.go | 28 ++++-------------- core/environment/utils.go | 26 +++++++++++++++++ core/environment/utils_test.go | 50 +++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 39 deletions(-) create mode 100644 core/environment/utils_test.go diff --git a/core/environment/environment.go b/core/environment/environment.go index 34241d78..f7536746 100644 --- a/core/environment/environment.go +++ b/core/environment/environment.go @@ -1232,19 +1232,8 @@ func (env *Environment) subscribeToWfState(taskman *task.Manager) { NewEnvGoErrorEvent(env, newCriticalTasksErrorMessage(env)), ) err := env.TryTransition(NewGoErrorTransition(taskman)) - if err != nil { - if env.Sm.Current() == "ERROR" { - log.WithField("partition", env.id). - WithField("level", infologger.IL_Devel). - Info("skipped requested transition to ERROR: environment already in ERROR state") - } else { - log.WithField("partition", env.id). - WithError(err). - WithField("level", infologger.IL_Devel). - Warn("could not transition gently to ERROR, forcing it") - env.setState(wfState.String()) - } + handleFailedGoError(err, env) } }) break WORKFLOW_STATE_LOOP @@ -1472,10 +1461,7 @@ func (env *Environment) scheduleAutoStopTransition() (scheduled bool, expected t err = env.TryTransition(NewGoErrorTransition(ManagerInstance().taskman)) if err != nil { - log.WithField("partition", env.id). - WithField("run", env.currentRunNumber). - Errorf("Forced transition to ERROR failed: %s", err.Error()) - env.setState("ERROR") + handleFailedGoError(err, env) } return } diff --git a/core/environment/manager.go b/core/environment/manager.go index e797f257..5c732607 100644 --- a/core/environment/manager.go +++ b/core/environment/manager.go @@ -496,10 +496,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string] envs.taskman), ) if err != nil { - log.WithField("partition", env.Id().String()). - WithField("state", envState). - Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress") - env.setState("ERROR") + handleFailedGoError(err, env) } envTasks := env.Workflow().GetTasks() @@ -603,10 +600,7 @@ func (envs *Manager) CreateEnvironment(workflowPath string, userVars map[string] envs.taskman), ) if errTxErr != nil { - log.WithField("partition", env.Id().String()). - WithField("state", envState). - WithError(errTxErr). - Debug("could not transition to ERROR after failed deployment/configuration, cleanup in progress") + handleFailedGoError(errTxErr, env) } envTasks := env.Workflow().GetTasks() // TeardownEnvironment manages the envs.mu internally @@ -1064,11 +1058,7 @@ func (envs *Manager) handleIntegratedServiceEvent(evt event.IntegratedServiceEve ) err = env.TryTransition(NewGoErrorTransition(envs.taskman)) if err != nil { - log.WithPrefix("scheduler"). - WithField("partition", envId.String()). - WithError(err). - Error("environment GO_ERROR transition failed after ODC_PARTITION_STATE_CHANGE ERROR event") - env.setState("ERROR") + handleFailedGoError(err, env) } } }() @@ -1124,12 +1114,7 @@ func (envs *Manager) handleLhcEvents(evt event.IntegratedServiceEvent) { if env.CurrentState() != "ERROR" { err = env.TryTransition(NewGoErrorTransition(envs.taskman)) if err != nil { - log.WithPrefix("scheduler"). - WithField("partition", envId.String()). - WithField("run", env.currentRunNumber). - WithError(err). - Error("environment GO_ERROR transition failed after a beam dump event, forcing") - env.setState("ERROR") + handleFailedGoError(err, env) } } } @@ -1483,11 +1468,8 @@ func (envs *Manager) CreateAutoEnvironment(workflowPath string, userVars map[str envs.taskman), ) if err != nil { - log.WithField("partition", env.Id().String()). - WithField("state", envState). - Debug("could not transition failed auto-transitioning environment to ERROR, cleanup in progress") + handleFailedGoError(err, env) env.sendEnvironmentEvent(&event.EnvironmentEvent{Message: "transition ERROR failed, forcing", EnvironmentID: env.Id().String(), Error: err}) - env.setState("ERROR") } envTasks := env.Workflow().GetTasks() diff --git a/core/environment/utils.go b/core/environment/utils.go index 229fde3e..b5367d24 100644 --- a/core/environment/utils.go +++ b/core/environment/utils.go @@ -27,12 +27,14 @@ package environment import ( "bytes" "encoding/json" + "errors" "fmt" "github.com/AliceO2Group/Control/common/logger/infologger" pb "github.com/AliceO2Group/Control/common/protos" "github.com/AliceO2Group/Control/core/task" "github.com/AliceO2Group/Control/core/task/sm" "github.com/AliceO2Group/Control/core/workflow" + "github.com/looplab/fsm" "os" "sort" @@ -139,3 +141,27 @@ func newCriticalTasksErrorMessage(env *Environment) string { return fmt.Sprintf("%d critical tasks transitioned to ERROR, could not determine the first one to fail", len(criticalTasksInError)) } } + +func handleFailedGoError(err error, env *Environment) { + var invalidEventErr *fsm.InvalidEventError + if errors.As(err, &invalidEventErr) { + // this case can occur if the environment is in either: + // - ERROR (env already transitioned to ERROR for another reason) + // - DONE (an error might have occurred during teardown, but it's already over, no point in spreading panic) + log.WithError(invalidEventErr). + WithField("partition", env.Id().String()). + WithField("run", env.currentRunNumber). + WithField("state", env.CurrentState()). + WithField(infologger.Level, infologger.IL_Support). + Warn("did not perform GO_ERROR transition") + } else { + // in principle this should never happen, so we log it accordingly and force the ERROR state just in case + log.WithError(err). + WithField("partition", env.Id().String()). + WithField("run", env.currentRunNumber). + WithField("state", env.CurrentState()). + WithField(infologger.Level, infologger.IL_Ops). + Error("could not perform GO_ERROR transition due to unexpected error, forcing...") + env.setState("ERROR") + } +} diff --git a/core/environment/utils_test.go b/core/environment/utils_test.go new file mode 100644 index 00000000..24e205d3 --- /dev/null +++ b/core/environment/utils_test.go @@ -0,0 +1,50 @@ +/* + * === This file is part of ALICE O² === + * + * Copyright 2025 CERN and copyright holders of ALICE O². + * Author: Piotr Konopka + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In applying this license CERN does not waive the privileges and + * immunities granted to it by virtue of its status as an + * Intergovernmental Organization or submit itself to any jurisdiction. + */ + +package environment + +import ( + "github.com/looplab/fsm" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("handleFailedGoError", func() { + It("does not overwrite state for InvalidEventError", func() { + env := &Environment{} + env.Sm = fsm.NewFSM("DONE", fsm.Events{}, fsm.Callbacks{}) + Expect(env.Sm.Current()).To(Equal("DONE")) + + handleFailedGoError(&fsm.InvalidEventError{Event: "GO_ERROR", State: "DONE"}, env) + Expect(env.Sm.Current()).To(Equal("DONE")) + }) + + It("overwrites state to ERROR for other errors", func() { + env := &Environment{} + env.Sm = fsm.NewFSM("CONFIGURED", fsm.Events{}, fsm.Callbacks{}) + + handleFailedGoError(fsm.UnknownEventError{Event: "BOOM"}, env) + Expect(env.Sm.Current()).To(Equal("ERROR")) + }) +})