Merge branch 'main' into pr110

janoberst · web-flow · commit 3bd883eb5ef7 · 2025-01-23T03:29:07.000-08:00
diff --git a/.env.example b/.env.example
@@ -2,7 +2,7 @@
 REDIS_CACHE_URL=redis://redis:6379/1
 OLLAMA_HOST=http://ollama:11434
 STORAGE_PROFILE_PATH=./storage_profiles
-LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
+REMOTE_API_URL=
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
@@ -15,3 +15,4 @@ LOAD_FILE_URL=http://localhost:8000/storage/load
 DELETE_FILE_URL=http://localhost:8000/storage/delete
 OCR_REQUEST_URL=http://localhost:8000/ocr/request
 OCR_UPLOAD_URL=http://localhost:8000/ocr/upload
+
diff --git a/.env.localhost.example b/.env.localhost.example
@@ -1,7 +1,7 @@
 #APP_ENV=production # sets the app into prod mode, othervise dev mode with auto-reload on code changes
 REDIS_CACHE_URL=redis://localhost:6379/1
-LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
 DISABLE_LOCAL_OLLAMA=0
+REMOTE_API_URL=
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
diff --git a/README.md b/README.md
@@ -7,8 +7,8 @@ The API is built with FastAPI and uses Celery for asynchronous task processing.
 ![hero doc extract](ocr-hero.webp)
 
 ## Features:
-- **No Cloud/external dependencies** all you need: PyTorch based OCR (EasyOCR) + Ollama are shipped and configured via `docker-compose`. No data is sent outside your dev/server environment.
-- **PDF/Office to Markdown** conversion with very high accuracy using different OCR strategies including [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [easyOCR](https://github.com/JaidedAI/EasyOCR), [minicpm-v](https://github.com/OpenBMB/MiniCPM-o?tab=readme-ov-file#minicpm-v-26)
+- **No Cloud/external dependencies** all you need: PyTorch based OCR (EasyOCR) + Ollama are shipped and configured via `docker-compose`; no data is sent outside your dev/server environment.
+- **PDF/Office to Markdown** conversion with very high accuracy using different OCR strategies including [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [easyOCR](https://github.com/JaidedAI/EasyOCR), [minicpm-v](https://github.com/OpenBMB/MiniCPM-o?tab=readme-ov-file#minicpm-v-26), remote URL strategies including [marker-pdf](https://github.com/VikParuchuri/marker)
 - **PDF/Office to JSON** conversion using Ollama supported models (eg. LLama 3.1)
 - **LLM Improving OCR results** LLama is pretty good with fixing spelling and text issues in the OCR text
 - **Removing PII** This tool can be used for removing Personally Identifiable Information out of document - see `examples`
@@ -196,6 +196,49 @@ LLama 3.2 Vision Strategy is licensed on [Meta Community License Agreement](http
 
 Enabled by default. Please do use the `strategy=llama_vision` CLI and URL parameters to use it. It's by the way the default strategy
 
+
+### `remote`
+
+Some OCR engines - like [Marker, a state-of-the-art PDF OCR](https://github.com/VikParuchuri/marker) - work really well for more than 50 languages, with great accuracy for Polish and other languages that are "difficult" for a standard OCR to read.
+
+`marker-pdf` is, however, licensed under the GPL3 license and is **therefore not included** by default in this application (as we're bound to MIT).
+
+Note on the model weights, quoting the Marker README: "The weights for the models are licensed cc-by-nc-sa-4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. You also must not be competitive with the Datalab API. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options here." ("I" and "here" refer to the Marker author and to the licensing options linked from the [Marker repository](https://github.com/VikParuchuri/marker).)
+
+To have it up and running you can execute the following steps:
+
+```bash
+mkdir marker-distribution # this should be outside of the `text-extract-api` folder!
+cd marker-distribution
+pip install marker-pdf
+pip install -U uvicorn fastapi python-multipart
+marker_server --port 8002
+```
+
+Set the Remote API URL:
+
+**Note:** you might run `marker_server` on a different port or server - in that case make sure you export the proper env setting before starting the `text-extract-api` server:
+
+```bash
+export REMOTE_API_URL=http://localhost:8002/marker/upload
+```
+
+**Note:** the URL can also be set via the `/config/strategies.yaml` file
+
+Run the `text-extract-api`:
+
+```bash
+make run
+```
+
+Please do use the `strategy=remote` CLI and URL parameters to use it. For example:
+
+```bash
+curl -X POST -H "Content-Type: multipart/form-data" -F "file=@examples/example-mri.pdf" -F "strategy=remote" -F "ocr_cache=true" -F "prompt=" -F "model=" "http://localhost:8000/ocr/upload" 
+```
+
+We connect to the remote OCR via its API, rather than linking it at the source code level, so as not to share the same license (GPL3).
+
 ## Getting started with Docker
 
 ### Prerequisites
@@ -444,7 +487,7 @@ apiClient.uploadFile(formData).then(response => {
 - **Method**: POST
 - **Parameters**:
   - **file**: PDF, image or Office file to be processed.
-  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v` or `easyocr`).
+  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v`, `remote` or `easyocr`). See the [available strategies](#text-extract-strategies)
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result
   - **model**: When provided along with the prompt - this model will be used for LLM processing
@@ -463,7 +506,7 @@ curl -X POST -H "Content-Type: multipart/form-data" -F "file=@examples/example-m
 - **Method**: POST
 - **Parameters** (JSON body):
   - **file**: Base64 encoded PDF file content.
-  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v` or `easyocr`).
+  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v`, `remote` or `easyocr`). See the [available strategies](#text-extract-strategies)
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result.
   - **model**: When provided along with the prompt - this model will be used for LLM processing.
diff --git a/config/strategies.yaml b/config/strategies.yaml
@@ -1,7 +1,14 @@
 strategies:
    llama_vision:
-      class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
+      class: text_extract_api.extract.strategies.ollama.OllamaStrategy
+      model: llama3.2-vision
+      prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
    minicpm_v:
-      class: text_extract_api.extract.strategies.minicpm_v.MiniCPMVStrategy
+      class: text_extract_api.extract.strategies.ollama.OllamaStrategy
+      model: minicpm-v
+      prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
    easyocr:
       class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
+   remote:
+      class: text_extract_api.extract.strategies.remote.RemoteStrategy
+      url:
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
@@ -18,7 +18,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
+      - REMOTE_API_URL=${REMOTE_API_URL}
     depends_on:
       - redis
       - ollama
@@ -44,7 +44,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
       - fastapi_app
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -18,7 +18,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}      
+      - REMOTE_API_URL=${REMOTE_API_URL}
     depends_on:
       - redis
       - ollama
@@ -39,7 +39,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}      
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}      
     depends_on:
       - redis
       - fastapi_app
diff --git a/text_extract_api/extract/strategies/minicpm_v.py b/text_extract_api/extract/strategies/minicpm_v.py
diff --git a/text_extract_api/extract/strategies/ollama.py b/text_extract_api/extract/strategies/ollama.py
@@ -10,8 +10,8 @@
 from text_extract_api.files.file_formats.image import ImageFileFormat
 
 
-class LlamaVisionStrategy(Strategy):
-    """Llama 3.2 Vision OCR Strategy"""
+class OllamaStrategy(Strategy):
+    """Ollama models OCR strategy"""
 
     @classmethod
     def name(cls) -> str:
@@ -24,7 +24,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                 and not file_format.can_convert_to(ImageFileFormat)
         ):
             raise TypeError(
-                f"Llama Vision - format {file_format.mime_type} is not supported (yet?)"
+                f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
             )
 
         images = FileFormat.convert_to(file_format, ImageFileFormat)
@@ -38,11 +38,12 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                 temp_file.write(image.binary)
                 temp_filename = temp_file.name
 
-            # Generate text using the Llama 3.2 Vision model
+            print(self._strategy_config)
+            # Generate text using the specified model
             try:
-                response = ollama.chat("llama3.2-vision", [{
+                response = ollama.chat(self._strategy_config.get('model'), [{
                     'role': 'user',
-                    'content': os.getenv('LLAMA_VISION_PROMPT', "You are OCR. Convert image to markdown."),
+                    'content': self._strategy_config.get('prompt'),
                     'images': [temp_filename]
                 }], stream=True)
                 os.remove(temp_filename)
@@ -63,7 +64,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                     20 / num_pages)  # 20% of work is for OCR - just a stupid assumption from tasks.py
             except ollama.ResponseError as e:
                 print('Error:', e.error)
-                raise Exception("Failed to generate text with Llama 3.2 Vision model")
+                raise Exception("Failed to generate text with Ollama model " + self._strategy_config.get('model'))
 
             print(response)
 
diff --git a/text_extract_api/extract/strategies/remote.py b/text_extract_api/extract/strategies/remote.py
@@ -0,0 +1,71 @@
+import os
+import tempfile
+import time
+
+from extract.extract_result import ExtractResult
+
+from text_extract_api.extract.strategies.strategy import Strategy
+from text_extract_api.files.file_formats.file_format import FileFormat
+from text_extract_api.files.file_formats.image import ImageFileFormat
+from text_extract_api.files.file_formats.pdf import PdfFileFormat
+import requests
+
+
+class RemoteStrategy(Strategy):
+    """Remote API Strategy"""
+
+    @classmethod
+    def name(cls) -> str:
+        return "remote"
+
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
+
+        if (
+                not isinstance(file_format, PdfFileFormat)
+                and not file_format.can_convert_to(PdfFileFormat)
+        ):
+            raise TypeError(
+                f"Marker PDF - format {file_format.mime_type} is not supported (yet?)"
+            )
+
+        pdf_files = FileFormat.convert_to(file_format, PdfFileFormat)
+        extracted_text = ""
+        start_time = time.time()
+        ocr_percent_done = 0
+        
+        if len(pdf_files) > 1:
+            raise ValueError("Only one PDF file is supported.")
+        
+        if len(pdf_files) == 0:
+            raise ValueError("No PDF file found - conversion error.")
+
+        try: 
+            url = os.getenv("REMOTE_API_URL", self._strategy_config.get("url"))
+            if not url:
+                raise Exception('Please do set the REMOTE_API_URL environment variable: export REMOTE_API_URL=http://...')
+            files = {'file': ('document.pdf', pdf_files[0].binary, 'application/pdf')}
+            data = {
+                'page_range': None,
+                'languages': language,
+                'force_ocr': False,
+                'paginate_output': False,
+                'output_format': 'markdown' # TODO: support JSON output format
+            }
+
+            meta = {
+                'progress': str(30 + ocr_percent_done),
+                'status': 'OCR Processing',
+                'start_time': start_time,
+                'elapsed_time': time.time() - start_time}
+            self.update_state_callback(state='PROGRESS', meta=meta)
+
+            response = requests.post(url, files=files, data=data)
+            if response.status_code != 200:
+                raise Exception(f"Failed to upload PDF file: {response.content}")
+
+            extracted_text = response.json().get('output', '')
+        except Exception as e:
+            print('Error:', e)
+            raise Exception("Failed to generate text with Remote API. Make sure the remote server is up and running")
+            
+        return ExtractResult.from_text(extracted_text)
diff --git a/text_extract_api/extract/strategies/strategy.py b/text_extract_api/extract/strategies/strategy.py
@@ -12,9 +12,14 @@
 
 class Strategy:
     _strategies: Dict[str, Strategy] = {}
+    _strategy_config: Dict[str, Dict] = {}
 
     def __init__(self):
         self.update_state_callback = None
+        self._strategy_config = None
+
+    def set_strategy_config(self, config: Dict):
+        self._strategy_config = config
 
     def set_update_state_callback(self, callback):
         self.update_state_callback = callback
@@ -88,8 +93,10 @@ def load_strategies_from_config(cls, path: str = os.getenv('OCR_CONFIG_PATH', 'c
             module = importlib.import_module(module_path)
 
             strategy = getattr(module, class_name)
-
-            cls.register_strategy(strategy(), strategy_name)
+            strategy_instance = strategy()
+            strategy_instance.set_strategy_config(strategy_config)
+            
+            cls.register_strategy(strategy_instance, strategy_name)
             print(f"Loaded strategy from {config_file_path} {strategy_name} [{strategy_class_path}]")
 
         return strategies
diff --git a/text_extract_api/files/converters/image_to_pdf.py b/text_extract_api/files/converters/image_to_pdf.py
@@ -0,0 +1,27 @@
+from io import BytesIO
+from typing import Iterator, Type
+from PIL import Image
+from files.converters.converter import Converter
+from files.file_formats.image import ImageFileFormat
+from files.file_formats.pdf import PdfFileFormat
+
+
+class ImageToPdfConverter(Converter):
+
+    @staticmethod
+    def convert(file_format: ImageFileFormat) -> Iterator[Type["PdfFileFormat"]]:
+
+        image = Image.open(BytesIO(file_format.binary))
+        pdf_bytes = ImageToPdfConverter._image_to_pdf_bytes(image)
+        yield PdfFileFormat.from_binary(
+            binary=pdf_bytes,
+            filename=f"{file_format.filename}.pdf",
+            mime_type="application/pdf"
+        )
+
+    @staticmethod
+    def _image_to_pdf_bytes(image: Image) -> bytes:
+
+        buffer = BytesIO()
+        image.save(buffer, format="PDF")
+        return buffer.getvalue()
diff --git a/text_extract_api/files/file_formats/image.py b/text_extract_api/files/file_formats/image.py