Commit 959105b

Merge pull request #103 from CatchTheTornado/feat/102-minicpm
[feat] minicpm-v support
2 parents 31b1a07 + 48cc738 commit 959105b

File tree

4 files changed: +101 -3 lines changed

README.md

Lines changed: 29 additions & 3 deletions

@@ -8,7 +8,7 @@ The API is built with FastAPI and uses Celery for asynchronous task processing.
 
 ## Features:
 - **No Cloud/external dependencies** all you need: PyTorch based OCR (EasyOCR) + Ollama are shipped and configured via `docker-compose` no data is sent outside your dev/server environment,
-- **PDF/Office to Markdown** conversion with very high accuracy using different OCR strategies including [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [easyOCR](https://github.com/JaidedAI/EasyOCR)
+- **PDF/Office to Markdown** conversion with very high accuracy using different OCR strategies including [llama3.2-vision](https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/), [easyOCR](https://github.com/JaidedAI/EasyOCR), [minicpm-v](https://github.com/OpenBMB/MiniCPM-o?tab=readme-ov-file#minicpm-v-26)
 - **PDF/Office to JSON** conversion using Ollama supported models (eg. LLama 3.1)
 - **LLM Improving OCR results** LLama is pretty good with fixing spelling and text issues in the OCR text
 - **Removing PII** This tool can be used for removing Personally Identifiable Information out of document - see `examples`

@@ -162,6 +162,32 @@ python client/cli.py ocr_upload --file examples/example-mri.pdf --ocr_cache --pr
 
 In case of any questions, help requests or just feedback - please [join us on Discord](https://discord.gg/NJzu47Ye3a)!
 
+
+## Text extract strategies
+
+### `easyocr`
+
+EasyOCR is available under an Apache-based license. It is a general-purpose OCR engine with support for more than 30 languages, probably with the best performance for English.
+
+Enabled by default. Use the `strategy=easyocr` CLI and URL parameters to select it.
+
+### `minicpm-v`
+
+MiniCPM-V is an OCR strategy available under an Apache-based license.
+
+The usage of MiniCPM-o/V model weights must strictly follow the [MiniCPM Model License.md](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md).
+
+The models and weights of MiniCPM are completely free for academic research. After filling out a ["questionnaire"](https://modelbest.feishu.cn/share/base/form/shrcnpV5ZT9EJ6xYjh3Kx0J6v8g) for registration, they are also available for free commercial use.
+
+Enabled by default. Use the `strategy=minicpm_v` CLI and URL parameters to select it.
+
+### `llama_vision`
+
+The LLama 3.2 Vision strategy is licensed under the [Meta Community License Agreement](https://ollama.com/library/llama3.2-vision/blobs/0b4284c1f870). It works great for many languages, although due to the number of parameters (90b) this model is probably **the slowest** one.
+
+Enabled by default. Use the `strategy=llama_vision` CLI and URL parameters to select it. It is, by the way, the default strategy.
+
 ## Getting started with Docker
 
 ### Prerequisites

@@ -410,7 +436,7 @@ apiClient.uploadFile(formData).then(response => {
 - **Method**: POST
 - **Parameters**:
   - **file**: PDF, image or Office file to be processed.
-  - **strategy**: OCR strategy to use (`llama_vision` or `easyocr`).
+  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v` or `easyocr`).
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result
   - **model**: When provided along with the prompt - this model will be used for LLM processing

@@ -429,7 +455,7 @@ curl -X POST -H "Content-Type: multipart/form-data" -F "file=@examples/example-m
 - **Method**: POST
 - **Parameters** (JSON body):
   - **file**: Base64 encoded PDF file content.
-  - **strategy**: OCR strategy to use (`llama_vision` or `easyocr`).
+  - **strategy**: OCR strategy to use (`llama_vision`, `minicpm_v` or `easyocr`).
   - **ocr_cache**: Whether to cache the OCR result (true or false).
   - **prompt**: When provided, will be used for Ollama processing the OCR result.
   - **model**: When provided along with the prompt - this model will be used for LLM processing.
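The new `minicpm_v` value plugs into the existing `strategy` parameter on both endpoints. As a minimal sketch of the JSON-body variant described above — the field names (`file`, `strategy`, `ocr_cache`) come from the parameter list in this diff, while the stand-in PDF bytes and the choice of HTTP client are assumptions:

```python
import base64
import json

# Build the JSON body for the base64 OCR endpoint, selecting the new strategy.
pdf_bytes = b"%PDF-1.4 example content"  # stand-in for real PDF file bytes
payload = {
    "file": base64.b64encode(pdf_bytes).decode("ascii"),
    "strategy": "minicpm_v",
    "ocr_cache": True,
}
body = json.dumps(payload)
# POST `body` with Content-Type: application/json using any HTTP client.
```

The `file` field round-trips through base64, so the server can recover the original bytes with a single decode.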

config/strategies.yaml

Lines changed: 2 additions & 0 deletions

@@ -1,5 +1,7 @@
 strategies:
   llama_vision:
     class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
+  minicpm_v:
+    class: text_extract_api.extract.strategies.minicpm_v.MiniCPMVStrategy
   easyocr:
     class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
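Each entry maps a strategy name to a dotted class path. The project's actual loader is not part of this diff, so the following is only a sketch of how such a path could be resolved at runtime, demonstrated with a stdlib class since the project package is not importable here:

```python
import importlib

def resolve_class(dotted_path: str):
    """Split 'pkg.module.ClassName' and import the class it points to."""
    module_path, _, class_name = dotted_path.rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# Stand-in for e.g. "text_extract_api.extract.strategies.minicpm_v.MiniCPMVStrategy":
cls = resolve_class("collections.OrderedDict")
```

Registering a new strategy is then just a matter of adding a name/class pair to the YAML, which is exactly what this two-line change does.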

scripts/entrypoint.sh

Lines changed: 1 addition & 0 deletions

@@ -24,6 +24,7 @@ else
   echo "Pulling LLM models, please wait until this process is done..."
   python client/cli.py llm_pull --model llama3.1
   python client/cli.py llm_pull --model llama3.2-vision
+  python client/cli.py llm_pull --model minicpm-v
   echo "LLM models are ready!"
 
 echo "Starting FastAPI app..."
text_extract_api/extract/strategies/minicpm_v.py

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
+import os
+import tempfile
+import time
+
+import ollama
+
+from text_extract_api.extract.strategies.strategy import Strategy
+from text_extract_api.files.file_formats.file_format import FileFormat
+from text_extract_api.files.file_formats.image import ImageFileFormat
+
+
+class MiniCPMVStrategy(Strategy):
+    """MiniCPM-V OCR Strategy"""
+
+    @classmethod
+    def name(cls) -> str:
+        return "minicpm_v"
+
+    def extract_text(self, file_format: FileFormat, language: str = 'en') -> str:
+        if (
+            not isinstance(file_format, ImageFileFormat)
+            and not file_format.can_convert_to(ImageFileFormat)
+        ):
+            raise TypeError(
+                f"MiniCPM-V - format {file_format.mime_type} is not supported (yet?)"
+            )
+
+        images = FileFormat.convert_to(file_format, ImageFileFormat)
+        extracted_text = ""
+        start_time = time.time()
+        ocr_percent_done = 0
+        num_pages = len(images)
+        for i, image in enumerate(images):
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
+                temp_file.write(image.binary)
+                temp_filename = temp_file.name
+
+            # Generate text using the MiniCPM-V model via Ollama
+            try:
+                response = ollama.chat("minicpm-v", [{
+                    'role': 'user',
+                    'content': os.getenv('MINICPMV_PROMPT', "You are OCR. Convert image to markdown."),
+                    'images': [temp_filename]
+                }], stream=True)
+                num_chunk = 1
+                for chunk in response:
+                    meta = {
+                        'progress': str(30 + ocr_percent_done),
+                        'status': 'OCR Processing'
+                                  + ' (page ' + str(i + 1) + ' of ' + str(num_pages) + ')'
+                                  + ' chunk no: ' + str(num_chunk),
+                        'start_time': start_time,
+                        'elapsed_time': time.time() - start_time,
+                    }
+                    self.update_state_callback(state='PROGRESS', meta=meta)
+                    num_chunk += 1
+                    extracted_text += chunk['message']['content']
+
+                # 20% of the overall work is attributed to OCR - an assumption
+                # taken over from tasks.py
+                ocr_percent_done += int(20 / num_pages)
+            except ollama.ResponseError as e:
+                print('Error:', e.error)
+                raise Exception("Failed to generate text with MiniCPM-V model") from e
+            finally:
+                # Clean up the temp image even if the Ollama call fails
+                os.remove(temp_filename)
+
+        return extracted_text
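The streaming loop above accumulates `chunk['message']['content']` while reporting per-chunk progress. The pattern can be exercised in isolation with a stubbed stream shaped like the output of `ollama.chat(..., stream=True)` — no model call is made, and the helper name below is hypothetical:

```python
import time

def accumulate_stream(response, num_pages=1, page_index=0, report=lambda meta: None):
    """Concatenate streamed message chunks, reporting progress per chunk
    (mirrors the loop in MiniCPMVStrategy.extract_text)."""
    extracted_text = ""
    start_time = time.time()
    for num_chunk, chunk in enumerate(response, start=1):
        report({
            'status': f"OCR Processing (page {page_index + 1} of {num_pages})"
                      f" chunk no: {num_chunk}",
            'start_time': start_time,
            'elapsed_time': time.time() - start_time,
        })
        extracted_text += chunk['message']['content']
    return extracted_text

# Stubbed stream with the same chunk shape the strategy consumes:
fake_stream = ({'message': {'content': part}} for part in ["# Title", "\n", "Body text"])
text = accumulate_stream(fake_stream)
```

Because the callback fires once per chunk, a task queue such as Celery can surface fine-grained progress even for long pages.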
