From 2f895cdb705681b7b07302a70a07075f0d43ff01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= <ianare@mindee.co>
Date: Wed, 9 Jul 2025 18:22:18 +0200
Subject: [PATCH] :recycle: update structure of raw texts

---
 .pre-commit-config.yaml                |   2 +-
 mindee/parsing/v2/inference_options.py |  17 +++-
 mindee/tests/product/fr/__init__.py    |   0
 mindee/tests/product/ind/__init__.py   |   0
 mindee/tests/product/us/__init__.py    |   0
 tests/data                             |   2 +-
 tests/v2/test_inference_response.py    | 135 ++++++++++---------------
 7 files changed, 67 insertions(+), 89 deletions(-)
 delete mode 100644 mindee/tests/product/fr/__init__.py
 delete mode 100644 mindee/tests/product/ind/__init__.py
 delete mode 100644 mindee/tests/product/us/__init__.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 31f60dbc..e204edc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
         ]
 
   - repo: https://github.com/gitleaks/gitleaks
-    rev: v8.18.2
+    rev: v8.18.4
     hooks:
       - id: gitleaks
 
diff --git a/mindee/parsing/v2/inference_options.py b/mindee/parsing/v2/inference_options.py
index 59d98051..151acfcc 100644
--- a/mindee/parsing/v2/inference_options.py
+++ b/mindee/parsing/v2/inference_options.py
@@ -1,12 +1,23 @@
-from typing import List, Optional
+from typing import List
 
 from mindee.parsing.common.string_dict import StringDict
 
 
+class RawText:
+    """Raw text entry pairing a ``page`` value with its text ``content``."""
+
+    page: int  # copied from the "page" key of the raw response
+    content: str  # copied from the "content" key of the raw response
+
+    def __init__(self, raw_response: StringDict):
+        self.page = raw_response["page"]  # both keys are required (KeyError)
+        self.content = raw_response["content"]
+
+
 class InferenceOptions:
     """Optional information about the document."""
 
-    raw_text: Optional[List[str]]
+    raw_texts: List[RawText]  # empty when the response carries no raw texts
 
     def __init__(self, raw_response: StringDict):
-        self.raw_text = raw_response["raw_text"] if "raw_text" in raw_response else None
+        self.raw_texts = [RawText(raw) for raw in raw_response.get("raw_texts") or []]
diff --git a/mindee/tests/product/fr/__init__.py b/mindee/tests/product/fr/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/mindee/tests/product/ind/__init__.py b/mindee/tests/product/ind/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/mindee/tests/product/us/__init__.py b/mindee/tests/product/us/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/data b/tests/data
index f599a960..2e278837 160000
--- a/tests/data
+++ b/tests/data
@@ -1 +1 @@
-Subproject commit f599a960e78f4a390984c6263f387aa8cdebe0f0
+Subproject commit 2e2788376cd0dd6168f1917129588fab6089378d
diff --git a/tests/v2/test_inference_response.py b/tests/v2/test_inference_response.py
index 0c10219f..b41e74b2 100644
--- a/tests/v2/test_inference_response.py
+++ b/tests/v2/test_inference_response.py
@@ -3,7 +3,6 @@
 import pytest
 
 from mindee import ClientV2, LocalResponse
-from mindee.parsing.common.string_dict import StringDict
 from mindee.parsing.v2 import (
     Inference,
     InferenceFile,
@@ -17,90 +16,30 @@
 
 
 @pytest.fixture
-def inference_result_json() -> StringDict:
-    return {
-        "inference": {
-            "model": {"id": "test-model-id"},
-            "file": {"name": "test-file-name.jpg", "alias": None},
-            "result": {
-                "fields": {
-                    "field_simple": {"value": "value_1"},
-                    "field_object": {
-                        "fields": {
-                            "sub_object_simple": {"value": "value_2"},
-                            "sub_object_list": {
-                                "items": [
-                                    {
-                                        "fields": {
-                                            "sub_object_list_sub_list_simple": {
-                                                "value": "value_3"
-                                            }
-                                        }
-                                    },
-                                    {
-                                        "fields": {
-                                            "sub_object_list_sub_list_object_subobject_1": {
-                                                "value": "value_4"
-                                            },
-                                            "sub_object_list_sub_list_object_subobject_2": {
-                                                "value": "value_5"
-                                            },
-                                        }
-                                    },
-                                ]
-                            },
-                            "sub_object_object": {
-                                "fields": {
-                                    "sub_object_object_sub_object_simple": {
-                                        "value": "value_6"
-                                    },
-                                    "sub_object_object_sub_object_object": {
-                                        "fields": {
-                                            "sub_object_object_sub_object_object_simple_1": {
-                                                "value": "value_7"
-                                            },
-                                            "sub_object_object_sub_object_object_simple_2": {
-                                                "value": "value_8"
-                                            },
-                                        }
-                                    },
-                                    "sub_object_object_sub_object_list": {
-                                        "items": [
-                                            {
-                                                "fields": {
-                                                    "sub_object_object_sub_object_list_simple": {
-                                                        "value": "value_9"
-                                                    },
-                                                    "sub_object_object_sub_object_list_object": {
-                                                        "fields": {
-                                                            "sub_object_object_sub_object_list_object_subobject_1": {
-                                                                "value": "value_10"
-                                                            },
-                                                            "sub_object_object_sub_object_list_object_subobject_2": {
-                                                                "value": "value_11"
-                                                            },
-                                                        }
-                                                    },
-                                                }
-                                            }
-                                        ]
-                                    },
-                                }
-                            },
-                        }
-                    },
-                },
-                "options": {
-                    "raw_text": ["toto", "tata", "titi"],
-                },
-            },
-        }
-    }
+def deep_nested_fields() -> dict:
+    """Provide the parsed ``deep_nested_fields.json`` sample response."""
+    sample_path = V2_DATA_DIR / "inference/deep_nested_fields.json"
+    with sample_path.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+@pytest.fixture
+def standard_field_types() -> dict:
+    """Provide the parsed ``standard_field_types.json`` sample response."""
+    sample_path = V2_DATA_DIR / "inference/standard_field_types.json"
+    with sample_path.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+@pytest.fixture
+def raw_texts() -> dict:
+    sample_path = V2_DATA_DIR / "inference/raw_texts.json"
+    return json.loads(sample_path.read_text(encoding="utf-8"))
 
 
 @pytest.mark.v2
-def test_inference_response(inference_result_json):
-    inference_result = InferenceResponse(inference_result_json)
+def test_deep_nested_fields(deep_nested_fields):
+    inference_result = InferenceResponse(deep_nested_fields)
     assert isinstance(inference_result.inference, Inference)
     assert isinstance(
         inference_result.inference.result.fields.field_simple, SimpleField
@@ -166,9 +105,37 @@ def test_inference_response(inference_result_json):
         == "value_9"
     )
 
+
+@pytest.mark.v2
+def test_standard_field_types(standard_field_types):
+    inference_result = InferenceResponse(standard_field_types)
+    assert isinstance(inference_result.inference, Inference)
+    assert isinstance(
+        inference_result.inference.result.fields.field_simple, SimpleField
+    )
+    assert isinstance(
+        inference_result.inference.result.fields.field_object, ObjectField
+    )
+    assert isinstance(
+        inference_result.inference.result.fields.field_simple_list, ListField
+    )
+    assert isinstance(
+        inference_result.inference.result.fields.field_object_list, ListField
+    )
+
+
+@pytest.mark.v2
+def test_raw_texts(raw_texts):
+    inference_result = InferenceResponse(raw_texts)
+    assert isinstance(inference_result.inference, Inference)
+
     assert inference_result.inference.result.options
-    assert len(inference_result.inference.result.options.raw_text) == 3
-    assert inference_result.inference.result.options.raw_text[0] == "toto"
+    page_texts = inference_result.inference.result.options.raw_texts
+    assert len(page_texts) == 2
+    # Only the first entry's page and content are verified.
+    first_text = page_texts[0]
+    assert first_text.page == 0
+    assert first_text.content == "This is the raw text of the first page..."
 
 
 @pytest.mark.v2