From 48f5bcc261663bc99077e41403474690d12de2b0 Mon Sep 17 00:00:00 2001 From: Piotr Konopka Date: Tue, 5 Aug 2025 14:36:48 +0200 Subject: [PATCH] Ensure that ODC Stop is called at FLP/QC-initiated GO_ERROR This allows us to stop a run on EPNs at the GO_ERROR transition by adding a corresponding ODC.EnsureStop hook. As GO_ERROR can occur with any source state, we make sure to make the actual STOP call only if the ODC partition is in the RUNNING state. At the same time, ODC partitions require us to call ODC.Stop if they voluntarily transition to ERROR. In such a case, ODC.Stop allows the remaining healthy devices to finish processing. By keeping the original ODC.Stop behaviour, we preserve this functionality. Additionally, the commit includes minor corrections to a few related logs. Fixes OCTRL-1036. --- core/integration/odc/plugin.go | 73 +++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/core/integration/odc/plugin.go b/core/integration/odc/plugin.go index b1550e95..b4ff8f3b 100644 --- a/core/integration/odc/plugin.go +++ b/core/integration/odc/plugin.go @@ -1437,12 +1437,13 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) { rn, ok := varStack["run_number"] if !ok { log.WithField("partition", envId). - WithField("call", "Start"). + WithField("call", "Stop"). Warn("cannot acquire run number for ODC Stop") } runNumberu64, err = strconv.ParseUint(rn, 10, 32) if err != nil { log.WithField("partition", envId). + WithField("call", "Stop"). WithError(err). Error("cannot acquire run number for ODC EOR") runNumberu64 = 0 @@ -1450,7 +1451,7 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) { runEndTimeMs, ok := varStack["run_end_time_ms"] if !ok { log.WithField("partition", envId). - WithField("call", "Start"). + WithField("call", "Stop"). 
Warn("cannot acquire run_end_time_ms") } @@ -1473,6 +1474,74 @@ func (p *Plugin) CallStack(data interface{}) (stack map[string]interface{}) { } return } + stack["EnsureStop"] = func() (out string) { + // ODC Stop + callFailedStr := "EPN EnsureStop call failed" + var ( + runNumberu64 uint64 + err error + ) + + timeout := callable.AcquireTimeout(ODC_STOP_TIMEOUT, varStack, "EnsureStop", envId) + + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + state, err := handleGetState(ctx, p.odcClient, envId) + if err != nil { + log.WithError(err). + WithField("level", infologger.IL_Support). + WithField("partition", envId). + WithField("call", "EnsureStop"). + Error("ODC error") + call.VarStack["__call_error_reason"] = err.Error() + call.VarStack["__call_error"] = callFailedStr + return + } + if state != "RUNNING" { + log.WithField("level", infologger.IL_Devel). + WithField("partition", envId). + WithField("call", "EnsureStop"). + Infof("ODC EnsureStop attempted, while ODC partition is not in 'RUNNING' but '%s', skipping", state) + return + } + + rn, ok := varStack["run_number"] + if !ok { + log.WithField("partition", envId). + WithField("call", "EnsureStop"). + Warn("cannot acquire run number for ODC EnsureStop") + } + runNumberu64, err = strconv.ParseUint(rn, 10, 32) + if err != nil { + log.WithField("partition", envId). + WithField("call", "EnsureStop"). + WithError(err). + Error("cannot acquire run number for ODC EOR") + runNumberu64 = 0 + } + runEndTimeMs, ok := varStack["run_end_time_ms"] + if !ok { + log.WithField("partition", envId). + WithField("call", "EnsureStop"). + Warn("cannot acquire run_end_time_ms") + } + + arguments := make(map[string]string) + arguments["run_end_time_ms"] = runEndTimeMs + + err = handleStop(ctx, p.odcClient, arguments, paddingTimeout, envId, runNumberu64, call) + if err != nil { + log.WithError(err). + WithField("level", infologger.IL_Support). + WithField("partition", envId). 
+ WithField("call", "EnsureStop"). + Error("ODC error") + call.VarStack["__call_error_reason"] = err.Error() + call.VarStack["__call_error"] = callFailedStr + } + return + } stack["EnsureCleanup"] = func() (out string) { // ODC Shutdown for current env + all orphans