Skip to content

Commit fe37006

Browse files
committed
feat: easyOCR implementation
1 parent 5101f4a commit fe37006

File tree

4 files changed

+59
-34
lines changed

4 files changed

+59
-34
lines changed

config/strategies.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ strategies:
33
class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
44
marker:
55
class: text_extract_api.extract.strategies.marker.MarkerStrategy
6-
tesseract:
7-
class: text_extract_api.extract.strategies.tesseract.TesseractStrategy
6+
easyocr:
7+
class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ readme = "README.md"
1212
requires-python = ">=3.8"
1313
dependencies = [
1414
"fastapi",
15+
"easyocr",
1516
"celery",
1617
"redis",
1718
"pytesseract",

text_extract_api/extract/strategies/easyocr.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import io
2+
import numpy as np
3+
from PIL import Image
4+
import easyocr
5+
6+
from text_extract_api.extract.strategies.strategy import Strategy
7+
from text_extract_api.files.file_formats.file_format import FileFormat
8+
from text_extract_api.files.file_formats.image_file_format import ImageFileFormat
9+
10+
11+
class EasyOCR(Strategy):
    """Text-extraction strategy backed by the EasyOCR library.

    The input file is converted to one or more ``ImageFileFormat`` objects,
    each image is run through EasyOCR, and the recognised lines are joined
    into a single string.
    """

    # Language codes handed to ``easyocr.Reader``. Override in a subclass to
    # recognise other languages, e.g. ``("en", "fr")``.
    languages = ("en",)

    # Lazily created ``easyocr.Reader`` cached on the class. Constructing a
    # Reader is expensive (EasyOCR loads its models into memory), so it is
    # built once and reused instead of being rebuilt on every call.
    _reader = None

    @classmethod
    def name(cls) -> str:
        """Return the identifier of this strategy."""
        return "easyOCR"

    @classmethod
    def _get_reader(cls):
        """Return the cached reader for ``cls``, creating it on first use."""
        # Look only in this class's own namespace so that a subclass which
        # overrides ``languages`` does not silently reuse its parent's reader.
        reader = cls.__dict__.get("_reader")
        if reader is None:
            reader = easyocr.Reader(list(cls.languages))
            cls._reader = reader
        return reader

    def extract_text(self, file_format: FileFormat) -> str:
        """Extract text from ``file_format`` using EasyOCR.

        Args:
            file_format: The file to read. It must either be an
                ``ImageFileFormat`` already or report that it can be
                converted to one.

        Returns:
            The recognised text. Lines within one image are separated by
            ``"\\n"`` and images (pages) are separated by ``"\\n\\n"``.

        Raises:
            TypeError: If ``file_format`` is not an image and cannot be
                converted to ``ImageFileFormat``.
        """
        # Fail early with a clear message when no image conversion exists.
        if not isinstance(
            file_format, ImageFileFormat
        ) and not file_format.can_convert_to(ImageFileFormat):
            raise TypeError(
                f"EasyOCR - format {file_format.mime_type} is not supported (yet?)"
            )

        images = FileFormat.convert_to(file_format, ImageFileFormat)
        reader = self._get_reader()

        pages = []
        for image_format in images:
            # Decode the in-memory bytes; the ``with`` block releases the
            # PIL image as soon as the pixel data has been copied out.
            with Image.open(io.BytesIO(image_format.binary)) as pil_image:
                # Normalise to 3-channel RGB. Palette, 1-bit, LA and CMYK
                # images would otherwise yield arrays whose shape or dtype
                # EasyOCR misinterprets or rejects.
                np_image = np.array(pil_image.convert("RGB"))

            # ``detail=0`` returns only the recognised strings, without
            # bounding boxes or confidence scores.
            lines = reader.readtext(np_image, detail=0)
            pages.append("\n".join(lines))

        return "\n\n".join(pages)


# config/strategies.yaml registers this strategy under the class path
# ``text_extract_api.extract.strategies.easyocr.EasyOCRStrategy``; expose that
# name too so the configured path resolves to this class.
# NOTE(review): assumes this module is the ``easyocr`` strategies module
# referenced by that config entry - confirm.
EasyOCRStrategy = EasyOCR

text_extract_api/extract/strategies/tesseract.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

0 commit comments

Comments
 (0)