#98 Introduce ExtractResult class and modify strategy interface

choinek · choinek · commit 1fb5bf6dc8b7 · 2025-01-20T02:09:55.000+01:00
This commit introduces a new ExtractResult class that replaces the primitive type (string) previously used for extract return values. Returning only the resulting string lost vital metadata about the document; ExtractResult preserves it. The strategy interface and its execution are modified accordingly to use the new ExtractResult class rather than a primitive string.
diff --git a/text_extract_api/extract/extract_result.py b/text_extract_api/extract/extract_result.py
@@ -0,0 +1,90 @@
+from typing import Callable, Any
+
+"""
+IMPORTANT INFORMATION ABOUT THIS CLASS:
+
+This is not the final version of the object, namespace, or intended use. 
+
+For this reason, I am not creating an interface, etc. Add code here as soon as possible 
+along with further integrations, and once we have gained sufficient experience, we will 
+undertake a refactor.
+
+Currently, the object's purpose is to replace the use of a primitive type, a string, for 
+extract returns. The limitation of this approach became evident when returning only the 
+resulting string caused us to lose valuable metadata about the document. Thanks to this 
+class, we retain DoclingDocument and foresee that other converters/OCRs may have similar 
+metadata.
+"""
+class ExtractResult:
+    def __init__(
+        self,
+        value: Any,
+        text_gatherer: Callable[[Any], str] = None
+    ):
+        """
+        Initializes an ExtractResult instance.
+
+        Args:
+            value (Any): The object containing or representing the text.
+            text_gatherer (Callable[[Any], str], optional): A callable that extracts text
+                from `value`. Defaults to `_default_text_gatherer`.
+
+        Raises:
+            ValueError: If `text_gatherer` is not callable or not provided when `value` is not a string.
+
+        Examples:
+            Using the default text gatherer (`text` is a property, not a method)
+
+            >>> unified = ExtractResult("Example text")
+            >>> print(unified.text)
+            Example text
+
+            Using a custom text gatherer
+
+            >>> def custom_gatherer(value): return f"Custom: {value}"
+            >>> unified = ExtractResult(123, custom_gatherer)
+            >>> print(unified.text)
+            Custom: 123
+        """
+
+        if text_gatherer is not None and not callable(text_gatherer):
+            raise ValueError("The `text_gatherer` provided to UnifiedText must be a callable.")
+
+        if not isinstance(value, str) and not callable(text_gatherer):
+            raise ValueError("If `value` is not a string, `text_gatherer` must be provided.")
+
+        self.value = value
+        self.text_gatherer = text_gatherer or self._default_text_gatherer
+
+    @staticmethod
+    def from_text(value: str) -> 'ExtractResult':
+        return ExtractResult(value)
+
+    @property
+    def text(self) -> str:
+        """
+        Retrieves text using the text gatherer.
+
+        Returns:
+            str: The extracted text from `value`.
+        """
+        return self.text_gatherer(self.value)
+
+    @staticmethod
+    def _default_text_gatherer(value: Any) -> str:
+        """
+        Default gatherer for values that are already strings.
+        It simply returns the value unchanged.
+
+        Args:
+            value (Any): The input value.
+
+        Returns:
+            str: The text representation of the input value.
+
+        Raises:
+            TypeError: If the `value` is not a string.
+        """
+        if isinstance(value, str):
+            return value
+        raise TypeError("Default text gatherer only supports strings.")
diff --git a/text_extract_api/extract/strategies/easyocr.py b/text_extract_api/extract/strategies/easyocr.py
@@ -3,6 +3,7 @@
 from PIL import Image
 import easyocr
 
+from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats.file_format import FileFormat
 from text_extract_api.files.file_formats.image import ImageFileFormat
@@ -13,7 +14,7 @@ class EasyOCRStrategy(Strategy):
     def name(cls) -> str:
         return "easyOCR"
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
         """
         Extract text using EasyOCR after converting the input file to images
         (if not already an ImageFileFormat). 
@@ -53,4 +54,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
 
         # Join text from all images/pages
         full_text = "\n\n".join(all_extracted_text)
-        return full_text
+
+
+        return ExtractResult.from_text(full_text)
diff --git a/text_extract_api/extract/strategies/llama_vision.py b/text_extract_api/extract/strategies/llama_vision.py
@@ -4,6 +4,7 @@
 
 import ollama
 
+from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats.file_format import FileFormat
 from text_extract_api.files.file_formats.image import ImageFileFormat
@@ -16,7 +17,7 @@ class LlamaVisionStrategy(Strategy):
     def name(cls) -> str:
         return "llama_vision"
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
 
         if (
                 not isinstance(file_format, ImageFileFormat)
@@ -66,4 +67,4 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
 
             print(response)
 
-        return extracted_text
+        return ExtractResult.from_text(extracted_text)
diff --git a/text_extract_api/extract/strategies/strategy.py b/text_extract_api/extract/strategies/strategy.py
@@ -7,6 +7,7 @@
 
 from pydantic.v1.typing import get_class
 
+from extract.extract_result import ExtractResult
 from text_extract_api.files.file_formats.file_format import FileFormat
 
 class Strategy:
@@ -27,7 +28,7 @@ def name(cls) -> str:
         raise NotImplementedError("Strategy subclasses must implement name")
 
     @classmethod
-    def extract_text(cls, file_format: Type["FileFormat"], language: str = 'en') -> str:
+    def extract_text(cls, file_format: Type["FileFormat"], language: str = 'en') -> ExtractResult:
         raise NotImplementedError("Strategy subclasses must implement extract_text method")
 
     @classmethod
diff --git a/text_extract_api/extract/tasks.py b/text_extract_api/extract/tasks.py
@@ -48,11 +48,13 @@ def ocr_task(
             extracted_text = cached_result.decode('utf-8')
 
     if extracted_text is None:
-        print("Extracting text from PDF...")
+        print(f"Extracting text from file using strategy: {strategy.name()}")
         self.update_state(state='PROGRESS',
-                          meta={'progress': 30, 'status': 'Extracting text from PDF', 'start_time': start_time,
+                          meta={'progress': 30, 'status': 'Extracting text from file', 'start_time': start_time,
                                 'elapsed_time': time.time() - start_time})  # Example progress update
-        extracted_text = strategy.extract_text(FileFormat.from_binary(binary_content), language)
+        extract_result = strategy.extract_text(FileFormat.from_binary(binary_content), language)
+        extracted_text = extract_result.text
+
     else:
         print("Using cached result...")
 
@@ -62,11 +64,12 @@ def ocr_task(
                             'start_time': start_time,
                             'elapsed_time': time.time() - start_time})  # Example progress update
 
+    # @todo Universal Text Object - is cache available
     if ocr_cache:
         redis_client.set(file_hash, extracted_text)
 
     if prompt:
-        print("Transforming text using LLM (prompt={prompt}, model={model}) ...")
+        print(f"Transforming text using LLM (prompt={prompt}, model={model}) ...")
         self.update_state(state='PROGRESS', meta={'progress': 75, 'status': 'Processing LLM', 'start_time': start_time,
                                                   'elapsed_time': time.time() - start_time})  # Example progress update
         llm_resp = ollama.generate(model, prompt + extracted_text, stream=True)
diff --git a/text_extract_api/files/file_formats/file_format.py b/text_extract_api/files/file_formats/file_format.py
@@ -68,7 +68,6 @@ def from_binary(
         mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
         from text_extract_api.files.file_formats.pdf import PdfFileFormat  # type: ignore
         file_format_class = cls._get_file_format_class(mime_type)
-        print(file_format_class)
         return file_format_class(binary_file_content=binary, filename=filename, mime_type=mime_type)
 
     def __repr__(self) -> str: