Skip to content

Commit b45c9fe

Browse files
committed
[feat]: generalized OllamaStrategy to replace MiniCPM/LlamaVision; added per-strategy config (such as the prompt) to the config file; added a url for the RemoteStrategy, so several remote strategies can now be defined within a single config with different URLs (the URL format is still fixed for now)
1 parent 3947265 commit b45c9fe

File tree

10 files changed

+27
-88
lines changed

10 files changed

+27
-88
lines changed

.env.example

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
REDIS_CACHE_URL=redis://redis:6379/1
33
OLLAMA_HOST=http://ollama:11434
44
STORAGE_PROFILE_PATH=./storage_profiles
5-
LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
65
REMOTE_API_URL=
76

87
# CLI settings

.env.localhost.example

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#APP_ENV=production # sets the app into prod mode, otherwise dev mode with auto-reload on code changes
22
REDIS_CACHE_URL=redis://localhost:6379/1
3-
LLAMA_VISION_PROMPT="You are OCR. Convert image to markdown."
43
DISABLE_LOCAL_OLLAMA=0
54
REMOTE_API_URL=
65

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ Set the Remote API Url:
223223
export REMOTE_API_URL=http://localhost:8002/marker/upload
224224
```
225225
226+
**Note:** the URL can also be set via the `config/strategies.yaml` file.
227+
226228
Run the `text-extract-api`:
227229
228230
```bash

config/strategies.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
strategies:
22
llama_vision:
3-
class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
3+
class: text_extract_api.extract.strategies.ollama.OllamaStrategy
4+
model: llama3.2-vision
5+
prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
46
minicpm_v:
5-
class: text_extract_api.extract.strategies.minicpm_v.MiniCPMVStrategy
7+
class: text_extract_api.extract.strategies.ollama.OllamaStrategy
8+
model: minicpm-v
9+
prompt: You are OCR. Convert image to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page.
610
easyocr:
711
class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
812
remote:
913
class: text_extract_api.extract.strategies.remote.RemoteStrategy
14+
url:

docker-compose.gpu.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ services:
1818
- LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
1919
- LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
2020
- DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
21-
- LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
2221
- REMOTE_API_URL=${REMOTE_API_URL}
2322
depends_on:
2423
- redis
@@ -45,7 +44,6 @@ services:
4544
- LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
4645
- LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
4746
- DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
48-
- LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
4947
depends_on:
5048
- redis
5149
- fastapi_app

docker-compose.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ services:
1818
- LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
1919
- LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
2020
- DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
21-
- LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
2221
- REMOTE_API_URL=${REMOTE_API_URL}
2322
depends_on:
2423
- redis
@@ -40,7 +39,6 @@ services:
4039
- LIST_FILES_URL=${LIST_FILES_URL-http://localhost:8000/storage/list}
4140
- LOAD_FILE_URL=${LOAD_FILE_URL-http://localhost:8000/storage/load}
4241
- DELETE_FILE_URL=${DELETE_FILE_URL-http://localhost:8000/storage/delete}
43-
- LLAMA_VISION_PROMPT=${LLAMA_VISION_PROMPT-"You are OCR. Convert image to markdown."}
4442
depends_on:
4543
- redis
4644
- fastapi_app

text_extract_api/extract/strategies/minicpm_v.py

Lines changed: 0 additions & 70 deletions
This file was deleted.

text_extract_api/extract/strategies/llama_vision.py renamed to text_extract_api/extract/strategies/ollama.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010
from text_extract_api.files.file_formats.image import ImageFileFormat
1111

1212

13-
class LlamaVisionStrategy(Strategy):
14-
"""Llama 3.2 Vision OCR Strategy"""
13+
class OllamaStrategy(Strategy):
14+
"""Ollama models OCR strategy"""
1515

1616
@classmethod
1717
def name(cls) -> str:
@@ -24,7 +24,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
2424
and not file_format.can_convert_to(ImageFileFormat)
2525
):
2626
raise TypeError(
27-
f"Llama Vision - format {file_format.mime_type} is not supported (yet?)"
27+
f"Ollama OCR - format {file_format.mime_type} is not supported (yet?)"
2828
)
2929

3030
images = FileFormat.convert_to(file_format, ImageFileFormat)
@@ -38,11 +38,12 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
3838
temp_file.write(image.binary)
3939
temp_filename = temp_file.name
4040

41-
# Generate text using the Llama 3.2 Vision model
41+
print(self._strategy_config)
42+
# Generate text using the specified model
4243
try:
43-
response = ollama.chat("llama3.2-vision", [{
44+
response = ollama.chat(self._strategy_config.get('model'), [{
4445
'role': 'user',
45-
'content': os.getenv('LLAMA_VISION_PROMPT', "You are OCR. Convert image to markdown."),
46+
'content': self._strategy_config.get('prompt'),
4647
'images': [temp_filename]
4748
}], stream=True)
4849
os.remove(temp_filename)
@@ -63,7 +64,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
6364
20 / num_pages) # 20% of work is for OCR - just a stupid assumption from tasks.py
6465
except ollama.ResponseError as e:
6566
print('Error:', e.error)
66-
raise Exception("Failed to generate text with Llama 3.2 Vision model")
67+
raise Exception("Failed to generate text with Ollama model " + self._strategy_config.get('model'))
6768

6869
print(response)
6970

text_extract_api/extract/strategies/remote.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def extract_text(self, file_format: FileFormat, language: str = 'en') -> Extract
4040
raise ValueError("No PDF file found - conversion error.")
4141

4242
try:
43-
url = os.getenv("REMOTE_API_URL", "")
43+
url = os.getenv("REMOTE_API_URL", self._strategy_config.get("url"))
4444
if not url:
4545
raise Exception('Please do set the REMOTE_API_URL environment variable: export REMOTE_API_URL=http://...')
4646
files = {'file': ('document.pdf', pdf_files[0].binary, 'application/pdf')}

text_extract_api/extract/strategies/strategy.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,14 @@
1212

1313
class Strategy:
1414
_strategies: Dict[str, Strategy] = {}
15+
_strategy_config: Dict[str, Dict] = {}
1516

1617
def __init__(self):
1718
self.update_state_callback = None
19+
self._strategy_config = None
20+
21+
def set_strategy_config(self, config: Dict):
22+
self._strategy_config = config
1823

1924
def set_update_state_callback(self, callback):
2025
self.update_state_callback = callback
@@ -88,8 +93,10 @@ def load_strategies_from_config(cls, path: str = os.getenv('OCR_CONFIG_PATH', 'c
8893
module = importlib.import_module(module_path)
8994

9095
strategy = getattr(module, class_name)
91-
92-
cls.register_strategy(strategy(), strategy_name)
96+
strategy_instance = strategy()
97+
strategy_instance.set_strategy_config(strategy_config)
98+
99+
cls.register_strategy(strategy_instance, strategy_name)
93100
print(f"Loaded strategy from {config_file_path} {strategy_name} [{strategy_class_path}]")
94101

95102
return strategies

0 commit comments

Comments
 (0)