From bca2b0e13308ac604ecb526c5f44e27fb6eeb814 Mon Sep 17 00:00:00 2001 From: nshkrdotcom Date: Fri, 4 Jul 2025 18:42:33 -1000 Subject: [PATCH 1/2] Fix trailing wrapper text removal in Layer 1 content cleaning (fixes #1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses GitHub issue #1 where JSON followed by wrapper text was not being properly cleaned by the content cleaning layer. Changes: - Added remove_trailing_wrapper_text/1 function to detect and remove text following valid JSON structures - Updated extract_json_content_internal/2 to call the new function as a final extraction step - Added test case specifically for the reported issue with JSON arrays followed by status messages The fix ensures that when JSON content is extracted, any trailing non-JSON text (like "1 Volume(s) created") is properly removed, returning only the valid JSON structure. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- lib/json_remedy/layer1/content_cleaning.ex | 65 +++++++++++++++++++++- test/unit/layer1_content_cleaning_test.exs | 30 ++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) mode change 100644 => 100755 lib/json_remedy/layer1/content_cleaning.ex mode change 100644 => 100755 test/unit/layer1_content_cleaning_test.exs diff --git a/lib/json_remedy/layer1/content_cleaning.ex b/lib/json_remedy/layer1/content_cleaning.ex old mode 100644 new mode 100755 index b652250..4464e21 --- a/lib/json_remedy/layer1/content_cleaning.ex +++ b/lib/json_remedy/layer1/content_cleaning.ex @@ -87,7 +87,10 @@ defmodule JsonRemedy.Layer1.ContentCleaning do # Then try to extract from prose/text {result, prose_repairs} = extract_from_prose(result) - all_repairs = existing_repairs ++ html_repairs ++ prose_repairs + # Finally, remove any trailing wrapper text after JSON + {result, trailing_repairs} = remove_trailing_wrapper_text(result) + + all_repairs = existing_repairs ++ html_repairs ++ prose_repairs 
++ trailing_repairs {result, all_repairs} end @@ -682,6 +685,66 @@ defmodule JsonRemedy.Layer1.ContentCleaning do find_balanced_end(rest, open, close, pos + 1, balance, in_string) end + # Remove trailing wrapper text after JSON + defp remove_trailing_wrapper_text(input) do + trimmed = String.trim(input) + + # Check if input starts with JSON structure + cond do + String.starts_with?(trimmed, "{") -> + check_and_remove_trailing_text(input, "{", "}") + + String.starts_with?(trimmed, "[") -> + check_and_remove_trailing_text(input, "[", "]") + + true -> + {input, []} + end + end + + defp check_and_remove_trailing_text(input, open_char, close_char) do + # Find where the JSON structure starts + json_start = + case String.split(input, open_char, parts: 2) do + [prefix, _] -> String.length(prefix) + _ -> 0 + end + + # Extract from the JSON start to find the balanced end + substring_from_json = String.slice(input, json_start, String.length(input)) + + case find_balanced_end(substring_from_json, open_char, close_char) do + nil -> + # Could not find balanced end, return as is + {input, []} + + end_pos -> + # Calculate the absolute position where JSON ends + json_end = json_start + end_pos + 1 + + # Check if there's non-whitespace content after JSON ends + after_json = String.slice(input, json_end, String.length(input)) + + if String.trim(after_json) == "" do + # No significant trailing content + {input, []} + else + # Extract only the JSON portion + json_content = String.slice(input, 0, json_end) + + repair = %{ + layer: :content_cleaning, + action: "removed trailing wrapper text", + position: json_end, + original: input, + replacement: json_content + } + + {json_content, [repair]} + end + end + end + # Helper functions for string detection using direct methods # Fast check for long text that likely contains JSON content diff --git a/test/unit/layer1_content_cleaning_test.exs b/test/unit/layer1_content_cleaning_test.exs old mode 100644 new mode 100755 index 360b4a0..9dc044a 
--- a/test/unit/layer1_content_cleaning_test.exs +++ b/test/unit/layer1_content_cleaning_test.exs @@ -174,6 +174,36 @@ defmodule JsonRemedy.Layer1.ContentCleaningTest do assert length(context.repairs) > 0 end end + + test "extracts json with trailing wrapper text (GitHub issue #1)" do + # This test case reproduces the issue where JSON followed by text is not cleaned + input = """ + [ + { + "volumeID": "f3a6ffd2-0111-4235-980c-a5ceec215e93", + "name": "km-tst-20", + "cloudID": "75b10103873d4a1ba0d52b43159a2842", + "size": 1, + "storageType": "ssd", + "state": "creating", + "shareable": false, + "bootable": false, + "volumePool": "General-Flash-002" + } + ] + 1 Volume(s) created + """ + + {:ok, result, context} = ContentCleaning.process(input, %{repairs: [], options: []}) + + # Should extract only the JSON array, removing the trailing text + trimmed_result = String.trim(result) + assert String.starts_with?(trimmed_result, "[") + assert String.ends_with?(trimmed_result, "]") + assert not String.contains?(result, "1 Volume(s) created") + assert length(context.repairs) > 0 + assert hd(context.repairs).action =~ "removed trailing wrapper text" + end end describe "encoding normalization" do From c6827c40de99cde972d9052f477905cbf8b1d5e8 Mon Sep 17 00:00:00 2001 From: nshkrdotcom Date: Fri, 4 Jul 2025 18:44:44 -1000 Subject: [PATCH 2/2] Bump version to 0.1.3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Updated version in mix.exs from 0.1.1 to 0.1.3 - Added changelog entry for the trailing wrapper text fix 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- CHANGELOG.md | 8 ++++++++ mix.exs | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) mode change 100644 => 100755 CHANGELOG.md mode change 100644 => 100755 mix.exs diff --git a/CHANGELOG.md b/CHANGELOG.md old mode 100644 new mode 100755 index e156cf8..226f6e7 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to 
[Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.3] - 2025-07-05 + +### Fixed +- Fixed issue where wrapper text following JSON blocks was not recognized (#1) + - Added dedicated `remove_trailing_wrapper_text/1` function in Layer 1 + - Now properly removes trailing text after valid JSON structures + - Example: `[{"id": 1}]\n1 Volume(s) created` → `[{"id": 1}]` + ## [0.1.2] - 2025-06-08 ### Added diff --git a/mix.exs b/mix.exs old mode 100644 new mode 100755 index e313907..bbb19e6 --- a/mix.exs +++ b/mix.exs @@ -1,7 +1,7 @@ defmodule JsonRemedy.MixProject do use Mix.Project - @version "0.1.1" + @version "0.1.3" @source_url "https://github.com/nshkrdotcom/json_remedy" def project do