From 227a1a95060706c9c589328285cc5557ad04717f Mon Sep 17 00:00:00 2001 From: Christopher Brown Date: Mon, 9 Jan 2023 14:39:08 -0800 Subject: [PATCH 1/3] do not fail workflow task if completing it errors It's possible for a workflow task to fail to complete even during normal operation. For example, if a signal is added to the workflow history concurrently with a workflow task attempting to complete the workflow then the `RespondWorkflowTaskCompleted` call will recieve an `InvalidArgument` error with an `UnhandledCommand` cause. If this happens then the Ruby SDK would get this error and then try and fail the workflow task due to how the `rescue` block encompassed the completion function. This would then lead to the `RespondWorkflowTaskFailed` call failing because the server doesn't recognize the task anymore. This is because the task was actually finalized when the original `RespondWorkflowTaskCompleted` call was made and so it should not be interacted with anymore. --- lib/temporal/workflow/task_processor.rb | 4 +++ .../temporal/workflow/task_processor_spec.rb | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/lib/temporal/workflow/task_processor.rb b/lib/temporal/workflow/task_processor.rb index 56057f5d..33e65dc8 100644 --- a/lib/temporal/workflow/task_processor.rb +++ b/lib/temporal/workflow/task_processor.rb @@ -139,6 +139,10 @@ def complete_task(commands, query_results) binary_checksum: binary_checksum, query_results: query_results ) + rescue StandardError => error + Temporal.logger.error("Unable to complete the workflow task", metadata.to_h.merge(error: error.inspect)) + + Temporal::ErrorHandler.handle(error, config, metadata: metadata) end def complete_query(result) diff --git a/spec/unit/lib/temporal/workflow/task_processor_spec.rb b/spec/unit/lib/temporal/workflow/task_processor_spec.rb index 00d1d745..8c096584 100644 --- a/spec/unit/lib/temporal/workflow/task_processor_spec.rb +++ b/spec/unit/lib/temporal/workflow/task_processor_spec.rb @@ -205,6 +205,33 @@ end end + context 'when recording the workflow task complete fails' do + let(:exception) { StandardError.new('workflow task could not be completed') } + + before { allow(connection).to receive(:respond_workflow_task_completed).and_raise(exception) } + + it 'does not try to fail the workflow task' do + subject.process + + expect(connection).to_not have_received(:respond_workflow_task_failed) + end + + it 'calls error_handlers' do + reported_error = nil + reported_metadata = nil + + config.on_error do |error, metadata: nil| + reported_error = error + reported_metadata = metadata + end + + subject.process + + expect(reported_error).to be_an_instance_of(StandardError) + expect(reported_metadata).to be_an_instance_of(Temporal::Metadata::WorkflowTask) + end + end + context 'when workflow task raises an exception' do let(:exception) { StandardError.new('workflow task failed') } From 90c25b8ed62a7ba540cc0888df00ee521b104758 Mon Sep 17 00:00:00 2001 From: Christopher Brown Date: Wed, 11 Jan 2023 11:59:58 -0800 Subject: [PATCH 2/3] add a comment --- lib/temporal/workflow/task_processor.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/temporal/workflow/task_processor.rb b/lib/temporal/workflow/task_processor.rb index 33e65dc8..93c349ef 100644 --- a/lib/temporal/workflow/task_processor.rb +++ b/lib/temporal/workflow/task_processor.rb @@ -140,6 +140,15 @@ def complete_task(commands, query_results) query_results: query_results ) rescue StandardError => error + # We rescue the error here to avoid failing the task in the process + # function above. One common cause of errors here is if the current + # workflow task is invalidated by a concurrent signal arriving while it + # tries to complete the workflow. In this case we do not need to and + # should not fail the workflow task. + # + # Not failing the workflow task will still result it being retried after + # a delay which is the behavior we'd want in cases like the above but + # also for ephemeral issues like network outages. Temporal.logger.error("Unable to complete the workflow task", metadata.to_h.merge(error: error.inspect)) Temporal::ErrorHandler.handle(error, config, metadata: metadata) From bcd9038e44030afbe924bff02a47b715b3ee5adc Mon Sep 17 00:00:00 2001 From: Christopher Brown Date: Wed, 11 Jan 2023 12:03:24 -0800 Subject: [PATCH 3/3] use more realistic error type in tests --- spec/unit/lib/temporal/workflow/task_processor_spec.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/unit/lib/temporal/workflow/task_processor_spec.rb b/spec/unit/lib/temporal/workflow/task_processor_spec.rb index 8c096584..ae14d2b0 100644 --- a/spec/unit/lib/temporal/workflow/task_processor_spec.rb +++ b/spec/unit/lib/temporal/workflow/task_processor_spec.rb @@ -206,7 +206,7 @@ end context 'when recording the workflow task complete fails' do - let(:exception) { StandardError.new('workflow task could not be completed') } + let(:exception) { GRPC::InvalidArgument.new('workflow task could not be completed') } before { allow(connection).to receive(:respond_workflow_task_completed).and_raise(exception) } @@ -227,7 +227,7 @@ subject.process - expect(reported_error).to be_an_instance_of(StandardError) + expect(reported_error).to be_an_instance_of(GRPC::InvalidArgument) expect(reported_metadata).to be_an_instance_of(Temporal::Metadata::WorkflowTask) end end