feat: easyOCR implementation

pkarw · pkarw · commit fe37006043a2 · 2025-01-17T13:45:31.000+01:00
diff --git a/config/strategies.yaml b/config/strategies.yaml
@@ -3,5 +3,5 @@ strategies:
       class: text_extract_api.extract.strategies.llama_vision.LlamaVisionStrategy
    marker:
       class: text_extract_api.extract.strategies.marker.MarkerStrategy
-   tesseract:
-      class: text_extract_api.extract.strategies.tesseract.TesseractStrategy
+   easyocr:
+      class: text_extract_api.extract.strategies.easyocr.EasyOCRStrategy
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ readme = "README.md"
 requires-python = ">=3.8"
 dependencies = [
     "fastapi",
+    "easyocr",
     "celery",
     "redis",
     "pytesseract",
diff --git a/text_extract_api/extract/strategies/easyocr.py b/text_extract_api/extract/strategies/easyocr.py
@@ -0,0 +1,56 @@
+import io
+import numpy as np
+from PIL import Image
+import easyocr
+
+from text_extract_api.extract.strategies.strategy import Strategy
+from text_extract_api.files.file_formats.file_format import FileFormat
+from text_extract_api.files.file_formats.image_file_format import ImageFileFormat
+
+
+class EasyOCRStrategy(Strategy):
+    """OCR strategy that reads text from images with EasyOCR."""
+
+    # Shared, lazily created Reader: loading the EasyOCR models is slow.
+    _reader = None
+
+    @classmethod
+    def name(cls) -> str:
+        """Return the identifier of this strategy."""
+        return "easyOCR"
+
+    def extract_text(self, file_format: FileFormat) -> str:
+        """
+        Extract text from ``file_format`` using EasyOCR.
+
+        Input is converted to ImageFileFormat images first (TypeError if that
+        is impossible); lines join with a newline, images with a blank line.
+        """
+        if not (
+            isinstance(file_format, ImageFileFormat)
+            or file_format.can_convert_to(ImageFileFormat)
+        ):
+            raise TypeError(
+                f"EasyOCR - format {file_format.mime_type} is not supported (yet?)"
+            )
+        images = FileFormat.convert_to(file_format, ImageFileFormat)
+
+        # Add or change languages to your needs, e.g., ['en', 'fr']
+        if EasyOCRStrategy._reader is None:
+            EasyOCRStrategy._reader = easyocr.Reader(['en'])
+        reader = EasyOCRStrategy._reader
+
+        pages = []
+        for image_format in images:
+            with Image.open(io.BytesIO(image_format.binary)) as pil_image:
+                # Palette, bilevel, CMYK... arrays are not valid OCR input.
+                if pil_image.mode not in ("L", "RGB", "RGBA"):
+                    pil_image = pil_image.convert("RGB")
+                np_image = np.array(pil_image)
+            # detail=0 returns only the recognized strings, no bounding boxes.
+            pages.append("\n".join(reader.readtext(np_image, detail=0)))
+        return "\n\n".join(pages)
+
+
+# Backward-compatible alias for the class name this module first shipped.
+EasyOCR = EasyOCRStrategy
diff --git a/text_extract_api/extract/strategies/tesseract.py b/text_extract_api/extract/strategies/tesseract.py