CatchTheTornado
diff --git a/‎.env.example‎
Lines changed: 2 additions & 1 deletion b/‎.env.example‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.env.localhost.example‎
Lines changed: 2 additions & 1 deletion b/‎.env.localhost.example‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 8 additions & 0 deletions b/‎Makefile‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 81 additions & 4 deletions b/‎README.md‎
Lines changed: 81 additions & 4 deletions
diff --git a/‎config/strategies.yaml‎
Lines changed: 12 additions & 1 deletion b/‎config/strategies.yaml‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎docker-compose.gpu.yml‎
Lines changed: 1 addition & 2 deletions b/‎docker-compose.gpu.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎docker-compose.yml‎
Lines changed: 1 addition & 2 deletions b/‎docker-compose.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎scripts/entrypoint.sh‎
Lines changed: 1 addition & 0 deletions b/‎scripts/entrypoint.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎text_extract_api/extract/extract_result.py‎
Lines changed: 90 additions & 0 deletions b/‎text_extract_api/extract/extract_result.py‎
Lines changed: 90 additions & 0 deletions
diff --git a/‎text_extract_api/extract/strategies/easyocr.py‎
Lines changed: 5 additions & 2 deletions b/‎text_extract_api/extract/strategies/easyocr.py‎
Lines changed: 5 additions & 2 deletions
@@ -2,7 +2,7 @@
 REDIS_CACHE_URL=redis://redis:6379/1
 OLLAMA_HOST=http://ollama:11434
 STORAGE_PROFILE_PATH=./storage_profiles
-LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
+REMOTE_API_URL=
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
@@ -15,3 +15,4 @@ LOAD_FILE_URL=http://localhost:8000/storage/load
 DELETE_FILE_URL=http://localhost:8000/storage/delete
 OCR_REQUEST_URL=http://localhost:8000/ocr/request
 OCR_UPLOAD_URL=http://localhost:8000/ocr/upload
+
@@ -1,6 +1,7 @@
 #APP_ENV=production # sets the app into prod mode, othervise dev mode with auto-reload on code changes
 REDIS_CACHE_URL=redis://localhost:6379/1
-LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
+DISABLE_LOCAL_OLLAMA=0
+REMOTE_API_URL=
 
 # CLI settings
 OCR_URL=http://localhost:8000/ocr/upload
 
@@ -3,6 +3,13 @@ SHELL := /bin/bash
 export DISABLE_VENV ?= 0
 export DISABLE_LOCAL_OLLAMA ?= 0
 
+define load_env
+	@if [ -f $(1) ]; then \
+		echo "Loading environment from $(1)"; \
+		set -o allexport; source $(1); set +o allexport; \
+	fi
+endef
+
 .PHONY: help
 help:
 	@echo "Available commands:"
@@ -81,6 +88,7 @@ install-requirements:
 
 .PHONY: run
 run:
+	@$(call load_env,.env.localhost)
 	@echo "Starting the local application server..."; \
 	DISABLE_VENV=$(DISABLE_VENV) DISABLE_LOCAL_OLLAMA=$(DISABLE_LOCAL_OLLAMA) ./run.sh
 
 
@@ -8,7 +8,7 @@ The API is built with FastAPI and uses Celery for asynchronous task processing.
 
 ## Features:
 - **No Cloud/external dependencies** all you need: PyTorch based OCR (EasyOCR) + Ollama are shipped and configured via `docker-compose` no data is sent outside your dev/server environment,
-- **PDF/Office to Markdown** conversion with very high accuracy using different OCR strategies including [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [easyOCR](https://github.com/JaidedAI/EasyOCR)
+- **PDF/Office to Markdown** conversion with very high accuracy using different OCR strategies including [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [easyOCR](https://github.com/JaidedAI/EasyOCR), [minicpm-v](https://github.com/OpenBMB/MiniCPM-o?tab=readme-ov-file#minicpm-v-26), remote URL strategies including [marker-pdf](https://github.com/VikParuchuri/marker)
 - **PDF/Office to JSON** conversion using Ollama supported models (eg. LLama 3.1)
 - **LLM Improving OCR results** LLama is pretty good with fixing spelling and text issues in the OCR text
 - **Removing PII** This tool can be used for removing Personally Identifiable Information out of document - see `examples`
@@ -162,6 +162,83 @@ python client/cli.py ocr_upload --file examples/example-mri.pdf --ocr_cache --pr
 
 In case of any questions, help requests or just feedback - please [join us on Discord](https://discord.gg/NJzu47Ye3a)!
 
+
+## Text extract stratgies
+
+### `easyocr`
+
+EasyOCR is available under an Apache-based license. It's a general-purpose OCR with support for more than 30 languages, probably with the best performance for English.
+
+Enabled by default. Please do use the `strategy=easyocr` CLI and URL parameters to use it.
+
+
+### `minicpm-v` 
+
+MiniCPM-V is an Apache-licensed OCR strategy.
+
+The usage of MiniCPM-o/V model weights must strictly follow [MiniCPM Model License.md](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md).
+
+The models and weights of MiniCPM are completely free for academic research. After filling out a ["questionnaire"](https://modelbest.feishu.cn/share/base/form/shrcnpV5ZT9EJ6xYjh3Kx0J6v8g) for registration, they are also available for free commercial use.
+
+Enabled by default. Please do use the `strategy=minicpm_v` CLI and URL parameters to use it.
+
+| ⚠️ **Remember to pull the model in Ollama first**       |
+|---------------------------------------------------------|
+| You need to pull the model in Ollama - use the command: |
+| `python client/cli.py llm_pull --model minicpm-v`       |
+| Or, if you have Ollama locally: `ollama pull minicpm-v` |
+
+
+
+### `llama_vision` 
+
+The Llama 3.2 Vision strategy is licensed under the [Meta Community License Agreement](https://ollama.com/library/llama3.2-vision/blobs/0b4284c1f870). It works great for many languages, although due to the number of parameters (90b) this model is probably **the slowest** one.
+
+Enabled by default. Please do use the `strategy=llama_vision` CLI and URL parameters to use it. It is also the default strategy.
+
+
+### `remote`
+
+Some OCRs - like [Marker, a state-of-the-art PDF OCR](https://github.com/VikParuchuri/marker) - work really well for more than 50 languages, with great accuracy for Polish and other languages that are "difficult" to read for standard OCR.
+
+The `marker-pdf` package is, however, licensed under GPL3 and **therefore it's not included** by default in this application (as we're bound to MIT).
+
+As the Marker authors state in their README: "The weights for the models are licensed cc-by-nc-sa-4.0, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. You also must not be competitive with the Datalab API. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options here." The options they refer to are described in the [Marker repository](https://github.com/VikParuchuri/marker).
+
+To have it up and running you can execute the following steps:
+
+```bash
+mkdir marker-distribution # this should be outside of the `text-extract-api` folder!
+cd marker-distribution
+pip install marker-pdf
+pip install -U uvicorn fastapi python-multipart
+marker_server --port 8002
+```
+
+Set the Remote API Url:
+
+**Note:** you might run `marker_server` on a different port or server - then just make sure you export the proper env setting before starting the `text-extract-api` server:
+
+```bash
+export REMOTE_API_URL=http://localhost:8002/marker/upload
+```
+
+**Note:** the URL might also be set via the `/config/strategies.yaml` file
+
+Run the `text-extract-api`:
+
+```bash
+make run
+```
+
+Please do use the `strategy=remote` CLI and URL parameters to use it. For example:
+
+```bash
+curl -X POST -H "Content-Type: multipart/form-data" -F "file=@examples/example-mri.pdf" -F "strategy=remote" -F "ocr_cache=true" -F "prompt=" -F "model=" "http://localhost:8000/ocr/upload" 
+```
+
+We are connecting to the remote OCR via its API so that we do not share the same license (GPL3), which would apply if it were all linked at the source code level.
+
 ## Getting started with Docker
 
 ### Prerequisites
@@ -410,7 +487,7 @@ apiClient.uploadFile(formData).then(response => {
 - **Method**: POST
 - **Parameters**:
   - **file**: PDF, image or Office file to be processed.
-  - **strategy**: OCR strategy to use (`llama_vision` or `easyocr`).
+  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v`, `remote` or `easyocr`). See the [available strategies](#text-extract-stratgies)
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result
   - **model**: When provided along with the prompt - this model will be used for LLM processing
@@ -429,7 +506,7 @@ curl -X POST -H "Content-Type: multipart/form-data" -F "file=@examples/example-m
 - **Method**: POST
 - **Parameters** (JSON body):
   - **file**: Base64 encoded PDF file content.
-  - **strategy**: OCR strategy to use (`llama_vision` or `easyocr`).
+  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v`, `remote` or `easyocr`). See the [available strategies](#text-extract-stratgies)
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result.
   - **model**: When provided along with the prompt - this model will be used for LLM processing.
@@ -447,7 +524,7 @@ curl -X POST "http://localhost:8000/ocr/request" -H "Content-Type: application/j
   "prompt": "",
   "model": "llama3.1",
   "storage_profile": "default",
-  "storage_filename": "example.pdf"
+  "storage_filename": "example.md"
 }'
 ```
 
 
@@ -1,7 +1,18 @@
 strategies:
    llama_vision:
-      class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
+      class: text_extract_api.extract.strategies.ollama.OllamaStrategy
+      model: llama3.2-vision
+      prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
+   minicpm_v:
+      class: text_extract_api.extract.strategies.ollama.OllamaStrategy
+      model: minicpm-v
+      prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
    easyocr:
       class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
    docling:
       class: text_extract_api.extract.strategies.docling.DoclingStrategy
+
+   # remote strategy example:
+   #remote:
+   #   class: text_extract_api.extract.strategies.remote.RemoteStrategy
+   #   url:
@@ -18,7 +18,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
+      - REMOTE_API_URL=${REMOTE_API_URL}
     depends_on:
       - redis
       - ollama
@@ -44,7 +44,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
       - fastapi_app
 
@@ -18,7 +18,7 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}      
+      - REMOTE_API_URL=${REMOTE_API_URL}
     depends_on:
       - redis
       - ollama
@@ -39,7 +39,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}      
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}      
     depends_on:
       - redis
       - fastapi_app
 
@@ -24,6 +24,7 @@ else
    echo "Pulling LLM models, please wait until this process is done..."
    python client/cli.py llm_pull --model llama3.1
    python client/cli.py llm_pull --model llama3.2-vision
+   python client/cli.py llm_pull --model minicpm-v
    echo "LLM models are ready!"
 
    echo "Starting FastAPI app..."
 
@@ -0,0 +1,90 @@
+from typing import Callable, Any
+
+"""
+IMPORTANT INFORMATION ABOUT THIS CLASS:
+
+This is not the final version of the object, namespace, or intended use. 
+
+For this reason, I am not creating an interface, etc. Add code here as soon as possible 
+along with further integrations, and once we have gained sufficient experience, we will 
+undertake a refactor.
+
+Currently, the object's purpose is to replace the use of a primitive type, a string, for 
+extract returns. The limitation of this approach became evident when returning only the 
+resulting string caused us to lose valuable metadata about the document. Thanks to this 
+class, we retain DoclingDocument and foresee that other converters/OCRs may have similar 
+metadata.
+"""
+class ExtractResult:
+    def __init__(
+        self,
+        value: Any,
+        text_gatherer: Callable[[Any], str] = None
+    ):
+        """
+        Initializes an ExtractResult instance.
+
+        Args:
+            value (Any): The object containing or representing the text.
+            text_gatherer (Callable[[Any], str], optional): A callable that extracts text
+                from `value`. Defaults to the `_default_text_gatherer`.
+
+        Raises:
+            ValueError: If `text_gatherer` is not callable or not provided when `value` is not a string.
+
+        Examples:
+            Using the default text gatherer (note that `text` is a property, not a method)
+
+            >>> result = ExtractResult("Example text")
+            >>> print(result.text)
+            Example text
+
+            Using a custom text gatherer
+
+            >>> def custom_gatherer(value): return f"Custom: {value}"
+            >>> result = ExtractResult(123, custom_gatherer)
+            >>> print(result.text)
+            Custom: 123
+        """
+
+        if text_gatherer is not None and not callable(text_gatherer):
+            raise ValueError("The `text_gatherer` provided to ExtractResult must be a callable.")
+
+        if not isinstance(value, str) and text_gatherer is None:
+            raise ValueError("If `value` is not a string, `text_gatherer` must be provided.")
+
+        self.value = value
+        self.text_gatherer = text_gatherer if text_gatherer is not None else self._default_text_gatherer
+
+    @classmethod
+    def from_text(cls, value: str) -> 'ExtractResult':
+        """Builds a result from plain text; uses `cls` so subclasses get their own type back."""
+        return cls(value)
+
+    @property
+    def text(self) -> str:
+        """
+        Retrieves text using the text gatherer.
+
+        Returns:
+            str: The extracted text from `value`.
+        """
+        return self.text_gatherer(self.value)
+
+    @staticmethod
+    def _default_text_gatherer(value: Any) -> str:
+        """
+        Default gatherer: returns `value` unchanged, since it only supports strings.
+
+        Args:
+            value (Any): The input value.
+
+        Returns:
+            str: The text representation of the input value.
+
+        Raises:
+            TypeError: If the `value` is not a string.
+        """
+        if isinstance(value, str):
+            return value
+        raise TypeError("Default text gatherer only supports strings.")
@@ -3,6 +3,7 @@
 from PIL import Image
 import easyocr
 
+from text_extract_api.extract.extract_result import ExtractResult
 from text_extract_api.extract.strategies.strategy import Strategy
 from text_extract_api.files.file_formats.file_format import FileFormat
 from text_extract_api.files.file_formats.image import ImageFileFormat
@@ -13,7 +14,7 @@ class EasyOCRStrategy(Strategy):
     def name(cls) -> str:
         return "easyOCR"
 
-    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> ExtractResult:
         """
         Extract text using EasyOCR after converting the input file to images
         (if not already an ImageFileFormat). 
@@ -53,4 +54,6 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
 
         # Join text from all images/pages
         full_text = "\n\n".join(all_extracted_text)
-        return full_text
+
+
+        return ExtractResult.from_text(full_text)