[feat]: generalized OllamaStrategy replaces the MiniCPM/LlamaVision strategies; added strategy settings such as the prompt to the config file, and added a url setting for the RemoteStrategy so that several remote strategies with different URLs can now be defined within a single config (the URL format is still fixed for now)

pkarw · pkarw · commit b45c9fe3501d · 2025-01-21T11:58:01.000+01:00
diff --git a/.env.example b/.env.example
@@ -2,7 +2,6 @@
 REDIS_CACHE_URL=redis://redis:6379/1
 OLLAMA_HOST=http://ollama:11434
 STORAGE_PROFILE_PATH=./storage_profiles
-LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
 REMOTE_API_URL=
 
 # CLI settings
diff --git a/.env.localhost.example b/.env.localhost.example
@@ -1,6 +1,5 @@
 #APP_ENV=production # sets the app into prod mode, othervise dev mode with auto-reload on code changes
 REDIS_CACHE_URL=redis://localhost:6379/1
-LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
 DISABLE_LOCAL_OLLAMA=0
 REMOTE_API_URL=
 
diff --git a/README.md b/README.md
@@ -223,6 +223,8 @@ Set the Remote API Url:
 export REMOTE_API_URL=http://localhost:8002/marker/upload
 ```
 
+**Note:** the URL can also be set via the `/config/strategies.yaml` file
+
 Run the `text-extract-api`:
 
 ```bash
diff --git a/config/strategies.yaml b/config/strategies.yaml
@@ -1,9 +1,14 @@
 strategies:
    llama_vision:
-      class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
+      class: text_extract_api.extract.strategies.ollama.OllamaStrategy
+      model: llama3.2-vision
+      prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
    minicpm_v:
-      class: text_extract_api.extract.strategies.minicpm_v.MiniCPMVStrategy
+      class: text_extract_api.extract.strategies.ollama.OllamaStrategy
+      model: minicpm-v
+      prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
    easyocr:
       class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
    remote:
       class: text_extract_api.extract.strategies.remote.RemoteStrategy
+      url:
diff --git a/docker-compose.gpu.yml b/docker-compose.gpu.yml
@@ -18,7 +18,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
       - REMOTE_API_URL=${REMOTE_API_URL}
     depends_on:
       - redis
@@ -45,7 +44,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
     depends_on:
       - redis
       - fastapi_app
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -18,7 +18,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}      
       - REMOTE_API_URL=${REMOTE_API_URL}
     depends_on:
       - redis
@@ -40,7 +39,6 @@ services:
       - LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}      
       - LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
       - DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
-      - LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}      
     depends_on:
       - redis
       - fastapi_app
diff --git a/text_extract_api/extract/strategies/minicpm_v.py b/text_extract_api/extract/strategies/minicpm_v.py
diff --git a/text_extract_api/extract/strategies/ollama.py b/text_extract_api/extract/strategies/ollama.py
@@ -10,8 +10,8 @@
 from text_extract_api.files.file_formats.image import ImageFileFormat
 
 
-class LlamaVisionStrategy(Strategy):
-    """Llama 3.2 Vision OCR Strategy"""
+class OllamaStrategy(Strategy):
+    """Ollama models OCR strategy"""
 
     @classmethod
     def name(cls) -> str:
@@ -24,7 +24,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                 and not file_format.can_convert_to(ImageFileFormat)
         ):
             raise TypeError(
-                f"Llama Vision - format {file_format.mime_type} is not supported (yet?)"
+                f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
             )
 
         images = FileFormat.convert_to(file_format, ImageFileFormat)
@@ -38,11 +38,12 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                 temp_file.write(image.binary)
                 temp_filename = temp_file.name
 
-            # Generate text using the Llama 3.2 Vision model
+            print(self._strategy_config)
+            # Generate text using the specified model
             try:
-                response = ollama.chat("llama3.2-vision", [{
+                response = ollama.chat(self._strategy_config.get('model'), [{
                     'role': 'user',
-                    'content': os.getenv('LLAMA_VISION_PROMPT', "You are OCR. Convert image to markdown."),
+                    'content': self._strategy_config.get('prompt'),
                     'images': [temp_filename]
                 }], stream=True)
                 os.remove(temp_filename)
@@ -63,7 +64,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
                     20 / num_pages)  # 20% of work is for OCR - just a stupid assumption from tasks.py
             except ollama.ResponseError as e:
                 print('Error:', e.error)
-                raise Exception("Failed to generate text with Llama 3.2 Vision model")
+                raise Exception("Failed to generate text with Ollama model " + self._strategy_config.get('model'))
 
             print(response)
 
diff --git a/text_extract_api/extract/strategies/remote.py b/text_extract_api/extract/strategies/remote.py
@@ -40,7 +40,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
             raise ValueError("No PDF file found - conversion error.")
 
         try: 
-            url = os.getenv("REMOTE_API_URL", "")
+            url = os.getenv("REMOTE_API_URL", self._strategy_config.get("url"))
             if not url:
                 raise Exception('Please do set the REMOTE_API_URL environment variable: export REMOTE_API_URL=http://...')
             files = {'file': ('document.pdf', pdf_files[0].binary, 'application/pdf')}
diff --git a/text_extract_api/extract/strategies/strategy.py b/text_extract_api/extract/strategies/strategy.py
@@ -12,9 +12,14 @@
 
 class Strategy:
     _strategies: Dict[str, Strategy] = {}
+    _strategy_config: Dict[str, Dict] = {}
 
     def __init__(self):
         self.update_state_callback = None
+        self._strategy_config = None
+
+    def set_strategy_config(self, config: Dict):
+        self._strategy_config = config
 
     def set_update_state_callback(self, callback):
         self.update_state_callback = callback
@@ -88,8 +93,10 @@ def load_strategies_from_config(cls, path: str = os.getenv('OCR_CONFIG_PATH', 'c
             module = importlib.import_module(module_path)
 
             strategy = getattr(module, class_name)
-
-            cls.register_strategy(strategy(), strategy_name)
+            strategy_instance = strategy()
+            strategy_instance.set_strategy_config(strategy_config)
+            
+            cls.register_strategy(strategy_instance, strategy_name)
             print(f"Loaded strategy from {config_file_path} {strategy_name} [{strategy_class_path}]")
 
         return strategies