Skip to content

Commit 57a7cd9

Browse files
committed
#54 ocr_strategies -> strategies
1 parent 3fba346 commit 57a7cd9

File tree

11 files changed

+32
-33
lines changed

11 files changed

+32
-33
lines changed

Makefile

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@ SHELL := /bin/bash
33
export DISABLE_VENV ?= 0
44
export DISABLE_LOCAL_OLLAMA ?= 0
55

6-
76
.PHONY: help
87
help:
98
@echo "Available commands:"

config/ocr_strategies.yaml

Lines changed: 0 additions & 7 deletions
This file was deleted.

config/strategies.yaml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
strategies:
2+
llama_vision:
3+
class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
4+
marker:
5+
class: text_extract_api.extract.strategies.marker.MarkerStrategy
6+
tesseract:
7+
class: text_extract_api.extract.strategies.tesseract.TesseractStrategy

run.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ set -a; source .env.localhost; set +a
3535

3636
if [ "$DISABLE_LOCAL_OLLAMA" -eq 1 ]; then
3737
echo "Local Ollama disabled by env \`DISABLE_LOCAL_OLLAMA=$DISABLE_LOCAL_OLLAMA\`"
38-
echo "External Ollama should be listening on OLLAMA_HOST={$OLLAMA_HOST}"
38+
echo "External Ollama should be listening on OLLAMA_HOST=$OLLAMA_HOST"
3939
else
4040
echo "Starting Ollama Server"
4141
ollama serve &
File renamed without changes.

text_extract_api/extract/ocr_strategies/llama_vision.py renamed to text_extract_api/extract/strategies/llama_vision.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,12 @@
44

55
import ollama
66

7-
from text_extract_api.extract.ocr_strategies.ocr_strategy import OCRStrategy
7+
from text_extract_api.extract.strategies.strategy import Strategy
88
from text_extract_api.files.file_formats.file_format import FileFormat
99
from text_extract_api.files.file_formats.image import ImageFileFormat
1010

1111

12-
class LlamaVisionOCRStrategy(OCRStrategy):
12+
class LlamaVisionStrategy(Strategy):
1313
"""Llama 3.2 Vision OCR Strategy"""
1414

1515
@classmethod

text_extract_api/extract/ocr_strategies/marker.py renamed to text_extract_api/extract/strategies/marker.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,12 @@
11
from marker.convert import convert_single_pdf
22
from marker.models import load_all_models
33

4-
from text_extract_api.extract.ocr_strategies.ocr_strategy import OCRStrategy
4+
from text_extract_api.extract.strategies.strategy import Strategy
55
from text_extract_api.files.file_formats.file_format import FileFormat
66
from text_extract_api.files.file_formats.pdf import PdfFileFormat
77

88

9-
class MarkerOCRStrategy(OCRStrategy):
9+
class MarkerStrategy(Strategy):
1010

1111
@classmethod
1212
def name(cls) -> str:

text_extract_api/extract/ocr_strategies/ocr_strategy.py renamed to text_extract_api/extract/strategies/strategy.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@
99

1010
from text_extract_api.files.file_formats.file_format import FileFormat
1111

12-
class OCRStrategy:
13-
_strategies: Dict[str, OCRStrategy] = {}
12+
class Strategy:
13+
_strategies: Dict[str, Strategy] = {}
1414

1515
def __init__(self):
1616
self.update_state_callback = None
@@ -31,7 +31,7 @@ def extract_text(cls, file_format: Type["FileFormat"]):
3131
raise NotImplementedError("Strategy subclasses must implement extract_text method")
3232

3333
@classmethod
34-
def get_strategy(cls, name: str) -> Type["OCRStrategy"]:
34+
def get_strategy(cls, name: str) -> Type["Strategy"]:
3535
"""
3636
Fetches and returns a registered strategy class based on the given name.
3737
@@ -58,13 +58,13 @@ def get_strategy(cls, name: str) -> Type["OCRStrategy"]:
5858
return cls._strategies[name]
5959

6060
@classmethod
61-
def register_strategy(cls, strategy: Type["OCRStrategy"], name: str = None, override: bool = False):
61+
def register_strategy(cls, strategy: Type["Strategy"], name: str = None, override: bool = False):
6262
name = name or strategy.name()
6363
if override or name not in cls._strategies:
6464
cls._strategies[name] = strategy
6565

6666
@classmethod
67-
def load_strategies_from_config(cls, path: str = os.getenv('OCR_CONFIG_PATH', 'config/ocr_strategies.yaml')):
67+
def load_strategies_from_config(cls, path: str = os.getenv('OCR_CONFIG_PATH', 'config/strategies.yaml')):
6868
strategies = cls._strategies
6969
project_root = os.path.dirname(os.path.dirname(os.path.abspath(path)))
7070
config_file_path = os.path.join(project_root, path)
@@ -75,10 +75,10 @@ def load_strategies_from_config(cls, path: str = os.getenv('OCR_CONFIG_PATH', 'c
7575
with open(config_file_path, 'r') as f:
7676
config = yaml.safe_load(f)
7777

78-
if 'ocr_strategies' not in config or not isinstance(config['ocr_strategies'], dict):
79-
raise ValueError(f"Missing or invalid 'ocr_strategies' section in the {config_file_path} file")
78+
if 'strategies' not in config or not isinstance(config['strategies'], dict):
79+
raise ValueError(f"Missing or invalid 'strategies' section in the {config_file_path} file")
8080

81-
for strategy_name, strategy_config in config['ocr_strategies'].items():
81+
for strategy_name, strategy_config in config['strategies'].items():
8282
if 'class' not in strategy_config:
8383
raise ValueError(f"Missing 'class' attribute for OCR strategy: {strategy_name}")
8484

@@ -109,7 +109,7 @@ def autodiscover_strategies(cls) -> Dict[str, Type]:
109109
continue
110110

111111
for submodule_info in pkgutil.walk_packages(module.__path__, module_info.name + "."):
112-
if ".ocr_strategies." not in submodule_info.name:
112+
if ".strategies." not in submodule_info.name:
113113
continue
114114

115115
try:
@@ -120,8 +120,8 @@ def autodiscover_strategies(cls) -> Dict[str, Type]:
120120
for attr_name in dir(ocr_module):
121121
attr = getattr(ocr_module, attr_name)
122122
if (isinstance(attr, type)
123-
and issubclass(attr, OCRStrategy)
124-
and attr is not OCRStrategy
123+
and issubclass(attr, Strategy)
124+
and attr is not Strategy
125125
and attr.name() not in strategies
126126
):
127127
strategies[attr.name()] = attr()

text_extract_api/extract/ocr_strategies/tesseract.py renamed to text_extract_api/extract/strategies/tesseract.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,12 @@
22
import numpy as np
33
import pytesseract
44

5-
from text_extract_api.extract.ocr_strategies.ocr_strategy import OCRStrategy
5+
from text_extract_api.extract.strategies.strategy import Strategy
66
from text_extract_api.files.file_formats.file_format import FileFormat
77
from text_extract_api.files.file_formats.image import ImageFileFormat
88

99

10-
class TesseractOCRStrategy(OCRStrategy):
10+
class TesseractStrategy(Strategy):
1111

1212
@classmethod
1313
def name(cls) -> str:

text_extract_api/extract/tasks.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
import redis
77

88
from text_extract_api.celery_app import app as celery_app
9-
from text_extract_api.extract.ocr_strategies.ocr_strategy import OCRStrategy
9+
from text_extract_api.extract.strategies.strategy import Strategy
1010
from text_extract_api.files.file_formats.file_format import FileFormat
1111
from text_extract_api.files.storage_manager import StorageManager
1212

@@ -32,8 +32,8 @@ def ocr_task(
3232
"""
3333
start_time = time.time()
3434

35-
ocr_strategy = OCRStrategy.get_strategy(strategy_name)
36-
ocr_strategy.set_update_state_callback(self.update_state)
35+
strategy = Strategy.get_strategy(strategy_name)
36+
strategy.set_update_state_callback(self.update_state)
3737

3838
self.update_state(state='PROGRESS', status="File uploaded successfully",
3939
meta={'progress': 10}) # Example progress update
@@ -51,7 +51,7 @@ def ocr_task(
5151
self.update_state(state='PROGRESS',
5252
meta={'progress': 30, 'status': 'Extracting text from PDF', 'start_time': start_time,
5353
'elapsed_time': time.time() - start_time}) # Example progress update
54-
extracted_text = ocr_strategy.extract_text(FileFormat.from_binary(binary_content))
54+
extracted_text = strategy.extract_text(FileFormat.from_binary(binary_content))
5555
else:
5656
print("Using cached result...")
5757

0 commit comments

Comments
 (0)