From 2f895cdb705681b7b07302a70a07075f0d43ff01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Wed, 9 Jul 2025 18:22:18 +0200 Subject: [PATCH] :recycle: update structure of raw texts --- .pre-commit-config.yaml | 2 +- mindee/parsing/v2/inference_options.py | 17 +++- mindee/tests/product/fr/__init__.py | 0 mindee/tests/product/ind/__init__.py | 0 mindee/tests/product/us/__init__.py | 0 tests/data | 2 +- tests/v2/test_inference_response.py | 135 ++++++++++--------------- 7 files changed, 67 insertions(+), 89 deletions(-) delete mode 100644 mindee/tests/product/fr/__init__.py delete mode 100644 mindee/tests/product/ind/__init__.py delete mode 100644 mindee/tests/product/us/__init__.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 31f60dbc..e204edc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: ] - repo: https://github.com/gitleaks/gitleaks - rev: v8.18.2 + rev: v8.18.4 hooks: - id: gitleaks diff --git a/mindee/parsing/v2/inference_options.py b/mindee/parsing/v2/inference_options.py index 59d98051..151acfcc 100644 --- a/mindee/parsing/v2/inference_options.py +++ b/mindee/parsing/v2/inference_options.py @@ -1,12 +1,23 @@ -from typing import List, Optional +from typing import List from mindee.parsing.common.string_dict import StringDict +class RawText: + """Raw text extracted from the document.""" + + page: int + content: str + + def __init__(self, raw_response: StringDict): + self.page = raw_response["page"] + self.content = raw_response["content"] + + class InferenceOptions: """Optional information about the document.""" - raw_text: Optional[List[str]] + raw_texts: List[RawText] def __init__(self, raw_response: StringDict): - self.raw_text = raw_response["raw_text"] if "raw_text" in raw_response else None + self.raw_texts = [RawText(raw_text) for raw_text in raw_response["raw_texts"]] diff --git a/mindee/tests/product/fr/__init__.py b/mindee/tests/product/fr/__init__.py deleted 
file mode 100644 index e69de29b..00000000 diff --git a/mindee/tests/product/ind/__init__.py b/mindee/tests/product/ind/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mindee/tests/product/us/__init__.py b/mindee/tests/product/us/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data b/tests/data index f599a960..2e278837 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit f599a960e78f4a390984c6263f387aa8cdebe0f0 +Subproject commit 2e2788376cd0dd6168f1917129588fab6089378d diff --git a/tests/v2/test_inference_response.py b/tests/v2/test_inference_response.py index 0c10219f..b41e74b2 100644 --- a/tests/v2/test_inference_response.py +++ b/tests/v2/test_inference_response.py @@ -3,7 +3,6 @@ import pytest from mindee import ClientV2, LocalResponse -from mindee.parsing.common.string_dict import StringDict from mindee.parsing.v2 import ( Inference, InferenceFile, @@ -17,90 +16,30 @@ @pytest.fixture -def inference_result_json() -> StringDict: - return { - "inference": { - "model": {"id": "test-model-id"}, - "file": {"name": "test-file-name.jpg", "alias": None}, - "result": { - "fields": { - "field_simple": {"value": "value_1"}, - "field_object": { - "fields": { - "sub_object_simple": {"value": "value_2"}, - "sub_object_list": { - "items": [ - { - "fields": { - "sub_object_list_sub_list_simple": { - "value": "value_3" - } - } - }, - { - "fields": { - "sub_object_list_sub_list_object_subobject_1": { - "value": "value_4" - }, - "sub_object_list_sub_list_object_subobject_2": { - "value": "value_5" - }, - } - }, - ] - }, - "sub_object_object": { - "fields": { - "sub_object_object_sub_object_simple": { - "value": "value_6" - }, - "sub_object_object_sub_object_object": { - "fields": { - "sub_object_object_sub_object_object_simple_1": { - "value": "value_7" - }, - "sub_object_object_sub_object_object_simple_2": { - "value": "value_8" - }, - } - }, - "sub_object_object_sub_object_list": { - "items": [ - { - 
"fields": { - "sub_object_object_sub_object_list_simple": { - "value": "value_9" - }, - "sub_object_object_sub_object_list_object": { - "fields": { - "sub_object_object_sub_object_list_object_subobject_1": { - "value": "value_10" - }, - "sub_object_object_sub_object_list_object_subobject_2": { - "value": "value_11" - }, - } - }, - } - } - ] - }, - } - }, - } - }, - }, - "options": { - "raw_text": ["toto", "tata", "titi"], - }, - }, - } - } +def deep_nested_fields() -> dict: + with (V2_DATA_DIR / "inference/deep_nested_fields.json").open( + "r", encoding="utf-8" + ) as fh: + return json.load(fh) + + +@pytest.fixture +def standard_field_types() -> dict: + with (V2_DATA_DIR / "inference/standard_field_types.json").open( + "r", encoding="utf-8" + ) as fh: + return json.load(fh) + + +@pytest.fixture +def raw_texts() -> dict: + with (V2_DATA_DIR / "inference/raw_texts.json").open("r", encoding="utf-8") as fh: + return json.load(fh) @pytest.mark.v2 -def test_inference_response(inference_result_json): - inference_result = InferenceResponse(inference_result_json) +def test_deep_nested_fields(deep_nested_fields): + inference_result = InferenceResponse(deep_nested_fields) assert isinstance(inference_result.inference, Inference) assert isinstance( inference_result.inference.result.fields.field_simple, SimpleField @@ -166,9 +105,37 @@ def test_inference_response(inference_result_json): == "value_9" ) + +@pytest.mark.v2 +def test_standard_field_types(standard_field_types): + inference_result = InferenceResponse(standard_field_types) + assert isinstance(inference_result.inference, Inference) + assert isinstance( + inference_result.inference.result.fields.field_simple, SimpleField + ) + assert isinstance( + inference_result.inference.result.fields.field_object, ObjectField + ) + assert isinstance( + inference_result.inference.result.fields.field_simple_list, ListField + ) + assert isinstance( + inference_result.inference.result.fields.field_object_list, ListField + ) + +
+@pytest.mark.v2 +def test_raw_texts(raw_texts): + inference_result = InferenceResponse(raw_texts) + assert isinstance(inference_result.inference, Inference) + assert inference_result.inference.result.options - assert len(inference_result.inference.result.options.raw_text) == 3 - assert inference_result.inference.result.options.raw_text[0] == "toto" + assert len(inference_result.inference.result.options.raw_texts) == 2 + assert inference_result.inference.result.options.raw_texts[0].page == 0 + assert ( + inference_result.inference.result.options.raw_texts[0].content + == "This is the raw text of the first page..." + ) @pytest.mark.v2