Merge pull request #105 from CatchTheTornado/feature/98-universal-text-object

choinek · web-flow · commit 3cce141ae2ed · 2025-01-20T23:22:10.000+01:00
Feature/98 universal text object
diff --git a/.env.localhost.example b/.env.localhost.example
@@ -1,6 +1,7 @@
 #APP_ENV=production # sets the app into prod mode, othervise dev mode with auto-reload on code changes
 REDIS_CACHE_URL=redis://localhost:6379/1
 LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
+DISABLE_LOCAL_OLLAMA=0
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
diff --git a/Makefile b/Makefile
@@ -3,6 +3,13 @@ SHELL := /bin/bash
 export DISABLE_VENV ?= 0
 export DISABLE_LOCAL_OLLAMA ?= 0
 
+define load_env
+	@if [ -f $(1) ]; then \
+		echo "Loading environment from $(1)"; \
+		set -o allexport; source $(1); set +o allexport; \
+	fi
+endef
+
 .PHONY: help
 help:
 	@echo "Available commands:"
@@ -81,6 +88,7 @@ install-requirements:
 
 .PHONY: run
 run:
+	@$(call load_env,.env.localhost)
 	@echo "Starting the local application server..."; \
 	DISABLE_VENV=$(DISABLE_VENV) DISABLE_LOCAL_OLLAMA=$(DISABLE_LOCAL_OLLAMA) ./run.sh
 
diff --git a/README.md b/README.md
@@ -182,6 +182,14 @@ The models and weights of MiniCPM are completely free for academic research. aft
 
 Enabled by default. Please do use the `strategy=minicpm_v` CLI and URL parameters to use it.
 
+| ⚠️ **Remember to pull the model in Ollama first**       |
+|---------------------------------------------------------|
+| You need to pull the model in Ollama - use the command: |
+| `python client/cli.py llm_pull --model minicpm-v`       |
+| Or, if you have Ollama locally: `ollama pull minicpm-v` |
+
+
+
 ### `llama_vision` 
 
 LLama 3.2 Vision Strategy is licensed on [Meta Community License Agreement](https://ollama.com/library/llama3.2-vision/blobs/0b4284c1f870). Works great for many languages, although due to the number of parameters (90b) this model is probably **the slowest** one.
@@ -473,7 +481,7 @@ curl -X POST "http://localhost:8000/ocr/request" -H "Content-Type: application/j
   "prompt": "",
   "model": "llama3.1",
   "storage_profile": "default",
-  "storage_filename": "example.pdf"
+  "storage_filename": "example.md"
 }'
 ```
 
diff --git a/text_extract_api/extract/extract_result.py b/text_extract_api/extract/extract_result.py
@@ -0,0 +1,90 @@
+from typing import Callable, Any
+
+"""
+IMPORTANT INFORMATION ABOUT THIS CLASS:
+
+This is not the final version of the object, namespace, or intended use. 
+
+For this reason, I am not creating an interface, etc. Add code here as soon as possible 
+along with further integrations, and once we have gained sufficient experience, we will 
+undertake a refactor.
+
+Currently, the object's purpose is to replace the use of a primitive type, a string, for 
+extract returns. The limitation of this approach became evident when returning only the 
+resulting string caused us to lose valuable metadata about the document. Thanks to this 
+class, we retain DoclingDocument and foresee that other converters/OCRs may have similar 
+metadata.
+"""
+class ExtractResult:
+    """Result of a text extraction: the raw `value` plus a callable that turns it into text."""
+    def __init__(
+        self,
+        value: Any,
+        text_gatherer: Callable[[Any], str] = None
+    ):
+        """
+        Initializes an ExtractResult instance.
+
+        Args:
+            value (Any): The object containing or representing the text.
+            text_gatherer (Callable[[Any], str], optional): A callable that extracts text
+                from `value`. Defaults to the `_default_text_gatherer`.
+
+        Raises:
+            ValueError: If `text_gatherer` is not callable or not provided when `value` is not a string.
+
+        Examples:
+            Using the default text gatherer
+
+            >>> unified = ExtractResult("Example text")
+            >>> print(unified.text)
+            Example text
+
+            Using a custom text gatherer
+
+            >>> def custom_gatherer(value): return f"Custom: {value}"
+            >>> unified = ExtractResult(123, custom_gatherer)
+            >>> print(unified.text)
+            Custom: 123
+        """
+        if text_gatherer is not None and not callable(text_gatherer):
+            raise ValueError("The `text_gatherer` provided to ExtractResult must be a callable.")
+
+        if text_gatherer is None and not isinstance(value, str):
+            raise ValueError("If `value` is not a string, `text_gatherer` must be provided.")
+
+        self.value = value
+        self.text_gatherer = text_gatherer or self._default_text_gatherer
+
+    @classmethod
+    def from_text(cls, value: str) -> 'ExtractResult':
+        """Builds an instance (of `cls`) from plain text, using the default text gatherer."""
+        return cls(value)
+
+    @property
+    def text(self) -> str:
+        """
+        Retrieves text using the text gatherer.
+
+        Returns:
+            str: The extracted text from `value`.
+        """
+        return self.text_gatherer(self.value)
+
+    @staticmethod
+    def _default_text_gatherer(value: Any) -> str:
+        """
+        Default gatherer: returns `value` unchanged when it is already a string.
+
+        Args:
+            value (Any): The input value.
+
+        Returns:
+            str: `value` itself.
+
+        Raises:
+            TypeError: If the `value` is not a string.
+        """
+        if isinstance(value, str):
+            return value
+        raise TypeError("Default text gatherer only supports strings.")
diff --git a/text_extract_api/extract/strategies/easyocr.py b/text_extract_api/extract/strategies/easyocr.py
@@ -3,6 +3,7 @@
 from PIL import Image
 import easyocr
 
+from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats.file_format import FileFormat
 from text_extract_api.files.file_formats.image import ImageFileFormat
@@ -13,7 +14,7 @@ class EasyOCRStrategy(Strategy):
     def name(cls) -> str:
         return "easyOCR"
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
         """
         Extract text using EasyOCR after converting the input file to images
         (if not already an ImageFileFormat). 
@@ -53,4 +54,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
 
         # Join text from all images/pages
         full_text = "\n\n".join(all_extracted_text)
-        return full_text
+
+
+        return ExtractResult.from_text(full_text)
diff --git a/text_extract_api/extract/strategies/llama_vision.py b/text_extract_api/extract/strategies/llama_vision.py
@@ -4,6 +4,7 @@
 
 import ollama
 
+from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats.file_format import FileFormat
 from text_extract_api.files.file_formats.image import ImageFileFormat
@@ -16,7 +17,7 @@ class LlamaVisionStrategy(Strategy):
     def name(cls) -> str:
         return "llama_vision"
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
 
         if (
                 not isinstance(file_format, ImageFileFormat)
@@ -66,4 +67,4 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
 
             print(response)
 
-        return extracted_text
+        return ExtractResult.from_text(extracted_text)
diff --git a/text_extract_api/extract/strategies/minicpm_v.py b/text_extract_api/extract/strategies/minicpm_v.py
@@ -4,6 +4,7 @@
 
 import ollama
 
+from extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats.file_format import FileFormat
 from text_extract_api.files.file_formats.image import ImageFileFormat
@@ -16,7 +17,7 @@ class MiniCPMVStrategy(Strategy):
     def name(cls) -> str:
         return "minicpm_v"
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
 
         if (
                 not isinstance(file_format, ImageFileFormat)
@@ -66,4 +67,4 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
 
             print(response)
 
-        return extracted_text
+        return ExtractResult.from_text(extracted_text)
diff --git a/text_extract_api/extract/strategies/strategy.py b/text_extract_api/extract/strategies/strategy.py
@@ -7,6 +7,7 @@
 
 from pydantic.v1.typing import get_class
 
+from extract.extract_result import ExtractResult
 from text_extract_api.files.file_formats.file_format import FileFormat
 
 class Strategy:
@@ -27,7 +28,7 @@ def name(cls) -> str:
         raise NotImplementedError("Strategy subclasses must implement name")
 
     @classmethod
-    def extract_text(cls, file_format: Type["FileFormat"], language: str = 'en') -> str:
+    def extract_text(cls, file_format: Type["FileFormat"], language: str = 'en') -> ExtractResult:
         raise NotImplementedError("Strategy subclasses must implement extract_text method")
 
     @classmethod
diff --git a/text_extract_api/extract/tasks.py b/text_extract_api/extract/tasks.py
@@ -48,11 +48,13 @@ def ocr_task(
             extracted_text = cached_result.decode('utf-8')
 
     if extracted_text is None:
-        print("Extracting text from PDF...")
+        print(f"Extracting text from file using strategy: {strategy.name()}")
         self.update_state(state='PROGRESS',
-                          meta={'progress': 30, 'status': 'Extracting text from PDF', 'start_time': start_time,
+                          meta={'progress': 30, 'status': 'Extracting text from file', 'start_time': start_time,
                                 'elapsed_time': time.time() - start_time})  # Example progress update
-        extracted_text = strategy.extract_text(FileFormat.from_binary(binary_content), language)
+        extract_result = strategy.extract_text(FileFormat.from_binary(binary_content), language)
+        extracted_text = extract_result.text
+
     else:
         print("Using cached result...")
 
@@ -62,11 +64,12 @@ def ocr_task(
                             'start_time': start_time,
                             'elapsed_time': time.time() - start_time})  # Example progress update
 
+    # TODO: Universal Text Object - is caching still available? Only the plain text is cached here.
     if ocr_cache:
         redis_client.set(file_hash, extracted_text)
 
     if prompt:
-        print("Transforming text using LLM (prompt={prompt}, model={model}) ...")
+        print(f"Transforming text using LLM (prompt={prompt}, model={model}) ...")
         self.update_state(state='PROGRESS', meta={'progress': 75, 'status': 'Processing LLM', 'start_time': start_time,
                                                   'elapsed_time': time.time() - start_time})  # Example progress update
         llm_resp = ollama.generate(model, prompt + extracted_text, stream=True)
diff --git a/text_extract_api/files/file_formats/file_format.py b/text_extract_api/files/file_formats/file_format.py
@@ -68,7 +68,6 @@ def from_binary(
         mime_type = mime_type or FileFormat._guess_mime_type(binary_data=binary, filename=filename)
         from text_extract_api.files.file_formats.pdf import PdfFileFormat  # type: ignore
         file_format_class = cls._get_file_format_class(mime_type)
-        print(file_format_class)
         return file_format_class(binary_file_content=binary, filename=filename, mime_type=mime_type)
 
     def __repr__(self) -> str:
diff --git a/text_extract_api/main.py b/text_extract_api/main.py
@@ -58,11 +58,12 @@ async def ocr_endpoint(
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
 
+    filename = storage_filename if storage_filename else file.filename
     file_binary = await file.read()
-    file_format = FileFormat.from_binary(file_binary)
+    file_format = FileFormat.from_binary(file_binary, filename, file.content_type)
 
     print(
-        f"Processing Document {file_format.filename} with strategy: {strategy}, ocr_cache: {ocr_cache}, model: {model}, storage_profile: {storage_profile}, storage_filename: {storage_filename}, language: {language}")
+        f"Processing Document {file_format.filename} with strategy: {strategy}, ocr_cache: {ocr_cache}, model: {model}, storage_profile: {storage_profile}, storage_filename: {storage_filename}, language: {language}, will be saved as: {filename}")
 
     # Asynchronous processing using Celery
     task = ocr_task.apply_async(
@@ -153,7 +154,7 @@ async def ocr_request_endpoint(request: OcrRequest):
     request_data = request.model_dump()
     try:
         OcrRequest(**request_data)
-        file = FileFormat.from_base64(request.file)
+        file = FileFormat.from_base64(request.file, request.storage_filename)
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))